Commit c319b4d7 authored by Vasiliy Kulikov's avatar Vasiliy Kulikov Committed by David S. Miller

net: ipv4: add IPPROTO_ICMP socket kind

This patch adds IPPROTO_ICMP socket kind.  It makes it possible to send
ICMP_ECHO messages and receive the corresponding ICMP_ECHOREPLY messages
without any special privileges.  In other words, the patch makes it
possible to implement setuid-less and CAP_NET_RAW-less /bin/ping.  In
order not to increase the kernel's attack surface, the new functionality
is disabled by default, but is enabled at bootup by supporting Linux
distributions, optionally with restriction to a group or a group range
(see below).

Similar functionality is implemented in Mac OS X:

A new ping socket is created with


Message identifiers (octets 4-5 of ICMP header) are interpreted as local
ports. Addresses are stored in struct sockaddr_in. No port numbers are
reserved for privileged processes, port 0 is reserved for API ("let the
kernel pick a free number"). There is no notion of remote ports, remote
port numbers provided by the user (e.g. in connect()) are ignored.

Data sent and received include ICMP headers. This is deliberate to:
1) Avoid the need to transport headers values like sequence numbers by
other means.
2) Make it easier to port existing programs using raw sockets.

ICMP headers given to send() are checked and sanitized. The type must be
ICMP_ECHO and the code must be zero (future extensions might relax this,
see below). The id is set to the number (local port) of the socket, the
checksum is always recomputed.

ICMP reply packets received from the network are demultiplexed according
to their id's, and are returned by recv() without any modifications.
IP header information and ICMP errors of those packets may be obtained
via ancillary data (IP_RECVTTL, IP_RETOPTS, and IP_RECVERR). ICMP source
quenches and redirects are reported as fake errors via the error queue
(IP_RECVERR); the next hop address for redirects is saved to ee_info (in
network order).

socket(2) is restricted to the group range specified in
"/proc/sys/net/ipv4/ping_group_range".  It is "1 0" by default, meaning
that nobody (not even root) may create ping sockets.  Setting it to "100
100" would grant permissions to the single group (to either make
/sbin/ping g+s and owned by this group or to grant permissions to the
"netadmins" group), "0 4294967295" would enable it for the world, "100
4294967295" would enable it for the users, but not daemons.

The existing code might be (in the unlikely case anyone needs it)
extended rather easily to handle other similar pairs of ICMP messages
(Timestamp/Reply, Information Request/Reply, Address Mask Request/Reply

Userspace ping util & patch for it:

For Openwall GNU/*/Linux it was the last step on the road to the
setuid-less distro.  A revision of this patch (for RHEL5/OpenVZ kernels)
is in use in Owl-current, such as in the 2011/03/12 LiveCD ISOs:

Initially this functionality was written by Pavel Kankovsky for
Linux 2.4.32, but unfortunately it was never made public.

All ping options (-b, -p, -Q, -R, -s, -t, -T, -M, -I), are tested with
the patch.

    - switched to flowi4.
    - minor changes to be consistent with raw sockets code.

    - changed ping_debug() to pr_debug().
    - removed CONFIG_IP_PING.
    - removed ping_seq_fops.owner field (unused for procfs).
    - switched to proc_net_fops_create().
    - switched to %pK in seq_printf().

    - fixed checksumming bug.
    - CAP_NET_RAW may not create icmp sockets anymore.

RFC v2:
    - minor cleanups.
    - introduced sysctl'able group range to restrict socket(2).
Signed-off-by: default avatarVasiliy Kulikov <>
Signed-off-by: default avatarDavid S. Miller <>
parent f2019030
......@@ -54,6 +54,8 @@ struct netns_ipv4 {
int sysctl_rt_cache_rebuild_count;
int current_rt_cache_rebuild_count;
unsigned int sysctl_ping_group_range[2];
atomic_t rt_genid;
atomic_t dev_addr_genid;
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
* Definitions for the "ping" module.
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
#ifndef _PING_H
#define _PING_H
#include <net/netns/hash.h>
/* PING_HTABLE_SIZE must be power of 2 */
#define ping_portaddr_for_each_entry(__sk, node, list) \
hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
* gid_t is either uint or ushort. We want to pass it to
* proc_dointvec_minmax(), so it must not be larger than MAX_INT
#define GID_T_MAX (((gid_t)~0U) >> 1)
struct ping_table {
struct hlist_nulls_head hash[PING_HTABLE_SIZE];
rwlock_t lock;
struct ping_iter_state {
struct seq_net_private p;
int bucket;
extern struct proto ping_prot;
extern void ping_rcv(struct sk_buff *);
extern void ping_err(struct sk_buff *, u32 info);
extern void inet_get_ping_group_range_net(struct net *net, unsigned int *low, unsigned int *high);
extern int __init ping_proc_init(void);
extern void ping_proc_exit(void);
void __init ping_init(void);
#endif /* _PING_H */
......@@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
inet_fragment.o ping.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
......@@ -105,6 +105,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/raw.h>
......@@ -1008,6 +1009,14 @@ static struct inet_protosw inetsw_array[] =
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.type = SOCK_RAW,
......@@ -1527,6 +1536,7 @@ static const struct net_protocol udp_protocol = {
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.err_handler = ping_err,
.no_policy = 1,
.netns_ok = 1,
......@@ -1642,6 +1652,10 @@ static int __init inet_init(void)
if (rc)
goto out_unregister_udp_proto;
rc = proto_register(&ping_prot, 1);
if (rc)
goto out_unregister_raw_proto;
* Tell SOCKET that we are alive...
......@@ -1697,6 +1711,8 @@ static int __init inet_init(void)
/* Add UDP-Lite (RFC 3828) */
* Set the ICMP layer up
......@@ -1727,6 +1743,8 @@ static int __init inet_init(void)
rc = 0;
return rc;
......@@ -1751,11 +1769,15 @@ static int __init ipv4_proc_init(void)
goto out_tcp;
if (udp4_proc_init())
goto out_udp;
if (ping_proc_init())
goto out_ping;
if (ip_misc_proc_init())
goto out_misc;
return rc;
......@@ -83,6 +83,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <net/ping.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
......@@ -781,6 +782,15 @@ static void icmp_redirect(struct sk_buff *skb)
iph->saddr, skb->dev);
/* Ping wants to see redirects.
* Let's pretend they are errors of sorts... */
if (iph->protocol == IPPROTO_ICMP &&
iph->ihl >= 5 &&
pskb_may_pull(skb, (iph->ihl<<2)+8)) {
ping_err(skb, icmp_hdr(skb)->un.gateway);
......@@ -1041,7 +1051,7 @@ int icmp_rcv(struct sk_buff *skb)
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
.handler = icmp_discard,
.handler = ping_rcv,
[1] = {
.handler = icmp_discard,
This diff is collapsed.
......@@ -13,6 +13,7 @@
#include <linux/seqlock.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/nsproxy.h>
#include <net/snmp.h>
#include <net/icmp.h>
#include <net/ip.h>
......@@ -21,6 +22,7 @@
#include <net/udp.h>
#include <net/cipso_ipv4.h>
#include <net/inet_frag.h>
#include <net/ping.h>
static int zero;
static int tcp_retr1_max = 255;
......@@ -30,6 +32,8 @@ static int tcp_adv_win_scale_min = -31;
static int tcp_adv_win_scale_max = 31;
static int ip_ttl_min = 1;
static int ip_ttl_max = 255;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
/* Update system visible IP port range */
static void set_local_port_range(int range[2])
......@@ -68,6 +72,65 @@ static int ipv4_local_port_range(ctl_table *table, int write,
return ret;
void inet_get_ping_group_range_net(struct net *net, gid_t *low, gid_t *high)
gid_t *data = net->ipv4.sysctl_ping_group_range;
unsigned seq;
do {
seq = read_seqbegin(&sysctl_local_ports.lock);
*low = data[0];
*high = data[1];
} while (read_seqretry(&sysctl_local_ports.lock, seq));
void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
gid_t *data = table->data;
unsigned seq;
do {
seq = read_seqbegin(&sysctl_local_ports.lock);
*low = data[0];
*high = data[1];
} while (read_seqretry(&sysctl_local_ports.lock, seq));
/* Update system visible IP port range */
static void set_ping_group_range(struct ctl_table *table, int range[2])
gid_t *data = table->data;
data[0] = range[0];
data[1] = range[1];
/* Validate changes from /proc interface. */
static int ipv4_ping_group_range(ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
int ret;
gid_t range[2];
ctl_table tmp = {
.data = &range,
.maxlen = sizeof(range),
.mode = table->mode,
.extra1 = &ip_ping_group_range_min,
.extra2 = &ip_ping_group_range_max,
inet_get_ping_group_range_table(table, range, range + 1);
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && ret == 0)
set_ping_group_range(table, range);
return ret;
static int proc_tcp_congestion_control(ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
......@@ -677,6 +740,13 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
.procname = "ping_group_range",
.data = &init_net.ipv4.sysctl_ping_group_range,
.maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range),
.mode = 0644,
.proc_handler = ipv4_ping_group_range,
{ }
......@@ -711,8 +781,18 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
table[6].data =
table[7].data =
* Sane defaults - nobody may create ping sockets.
* Boot scripts should set this to distro-specific group.
net->ipv4.sysctl_ping_group_range[0] = 1;
net->ipv4.sysctl_ping_group_range[1] = 0;
net->ipv4.sysctl_rt_cache_rebuild_count = 4;
net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment