Ipset walk
1. Init of the module:
hash_mac_init(void)
|
|-> ip_set_type_register(&hash_mac_type);
|
|-> list_add_rcu(&hash_mac_type->list, &ip_set_type_list);
2. static struct ip_set_type hash_mac_type __read_mostly = {
/*
 * Type descriptor for the "hash:mac" set type. Per the call graph in
 * item 1, hash_mac_init() passes this object to ip_set_type_register(),
 * which links it into ip_set_type_list.
 */
.name = "hash:mac",
.protocol = IPSET_PROTOCOL,
.features = IPSET_TYPE_MAC,
.dimension = IPSET_DIM_ONE,
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
.create = hash_mac_create,
/* create_policy: expected netlink type for each IPSET_ATTR_* listed */
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
[IPSET_ATTR_PROBES] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
/*
 * adt_policy: expected netlink type (and, where given, a length bound)
 * for each attribute of an add/delete/test request.
 */
.adt_policy = {
/* The MAC address itself: binary attribute bounded by ETH_ALEN */
[IPSET_ATTR_ETHER] = { .type = NLA_BINARY,
.len = ETH_ALEN },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
/* NUL-terminated string bounded by IPSET_MAX_COMMENT_SIZE */
[IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
.len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
},
.me = THIS_MODULE,
};
3.
1. What is the purpose of HTYPE? It does not appear to be used anywhere else in the file.
/* Type specific function prefix */
#define HTYPE hash_mac
2. What is the purpose of MTYPE.
#define MTYPE hash_mac4
Most of the functions are generated: the macros are consumed by the
included ip_set_hash_gen.h file. HTYPE is used for the IPv4/IPv6-independent
functions, while MTYPE is required for the IPv4- and IPv6-specific
ones.
3. Why are two different functions used, KADT and UADT? I understand that
ADT stands for add, delete, test, and that K and U probably mean kernel
space and user space. But how do the operations of KADT and UADT differ?
Those are the common add, del and test routines for the kernel path and the
userspace path, respectively. The functions are separate because the input
arrives in different forms: in the userspace case as netlink attributes,
while in the kernel case through the API.
4. Who calls kadt in kernel space? How is uadt called from user space?
The kadt functions are called from the kernel-side interfaces (ip_set_add,
etc.), while the uadt ones are called from ip_set_uadd, etc. (see
ip_set_core.c). Please note that the uadt functions are NOT called in
userspace: they work on data sent FROM userspace.
4.
Userspace command: ipset
Function ipset_cmd(session, cmd, restore_line);
/**
 * ipset_cmd - execute a command
 * @session: session structure
 * @cmd: command to execute
 * @lineno: command line number in restore mode
 *
 * Execute - or prepare/buffer in restore mode - a command.
 * It is the caller's responsibility that the data field be filled out
 * with all required parameters for a successful execution.
 * The data field is cleared after this function call for the public
 * commands.
 *
 * A @cmd outside the open range (IPSET_CMD_NONE, IPSET_MSG_MAX) is
 * ignored and reported as success. The private commands IPSET_CMD_TYPE
 * and IPSET_CMD_HEADER are sent immediately and return the result of
 * build_send_private_msg() without clearing the data field.
 *
 * Returns 0 on success or a negative error code.
 */
int ipset_cmd(struct ipset_session *session, enum ipset_cmd cmd, uint32_t lineno)
{
	struct ipset_data *data;
	bool aggregate = false;
	int ret = -1;

	assert(session);

	if (cmd <= IPSET_CMD_NONE || cmd >= IPSET_MSG_MAX)
		return 0;

	/* Initialize transport method if not done yet */
	if (session->handle == NULL && init_transport(session) == NULL)
		return ipset_err(session,
				 "Cannot open session to kernel.");

	data = session->data;

	/* Check protocol version once */
	if (!session->version_checked) {
		if (build_send_private_msg(session, IPSET_CMD_PROTOCOL) < 0)
			return -1;
	}

	/* Private commands */
	if (cmd == IPSET_CMD_TYPE || cmd == IPSET_CMD_HEADER)
		return build_send_private_msg(session, cmd);

	/* Check aggregatable commands */
	aggregate = may_aggregate_ad(session, cmd);
	if (!aggregate) {
		/* Flush possible aggregated commands */
		ret = ipset_commit(session);
		if (ret < 0)
			return ret;
	}

	/* Real command: update lineno too */
	session->cmd = cmd;
	session->lineno = lineno;

	/* Set default output mode */
	if (cmd == IPSET_CMD_LIST) {
		if (session->mode == IPSET_LIST_NONE)
			session->mode = IPSET_LIST_PLAIN;
	} else if (cmd == IPSET_CMD_SAVE) {
		if (session->mode == IPSET_LIST_NONE)
			session->mode = IPSET_LIST_SAVE;
	}

	/* Start the root element in XML mode */
	if ((cmd == IPSET_CMD_LIST || cmd == IPSET_CMD_SAVE) &&
	    session->mode == IPSET_LIST_XML)
		safe_snprintf(session, "<ipsets>\n");

	D("next: build_msg");
	/* Build new message or append buffered commands */
	ret = build_msg(session, aggregate);
	D("build_msg returned %d", ret);
	if (ret > 0) {
		/* Buffer is full, send buffered commands */
		ret = ipset_commit(session);
		if (ret < 0)
			goto cleanup;
		/* Retry into the now-empty buffer, without aggregation */
		ret = build_msg(session, false);
		D("build_msg 2 returned %d", ret);
	}
	if (ret < 0)
		goto cleanup;
	D("past: build_msg");

	/* We have to save the type for error handling */
	session->saved_type = ipset_data_get(data, IPSET_OPT_TYPE);
	if (session->lineno != 0 &&
	    (cmd == IPSET_CMD_ADD || cmd == IPSET_CMD_DEL)) {
		/*
		 * Save setname for the next possible aggregated restore line.
		 * NOTE(review): unbounded copy - assumes saved_setname is at
		 * least as large as the name held in data; confirm.
		 */
		strcpy(session->saved_setname, ipset_data_setname(data));
		/* Don't commit: we may aggregate next command */
		ret = 0;
		goto cleanup;
	}

	D("call commit");
	ret = ipset_commit(session);

cleanup:
	/* Single reset point for every path that reaches the message stage */
	D("reset data");
	ipset_data_reset(data);
	return ret;
}
hash_mac_init(void)
|
|-> ip_set_type_register(&hash_mac_type);
|
|-> list_add_rcu(&hash_mac_type->list, &ip_set_type_list);
2. static struct ip_set_type hash_mac_type __read_mostly = {
/*
 * Type descriptor for the "hash:mac" set type. Per the call graph just
 * above, hash_mac_init() passes this object to ip_set_type_register(),
 * which links it into ip_set_type_list.
 */
.name = "hash:mac",
.protocol = IPSET_PROTOCOL,
.features = IPSET_TYPE_MAC,
.dimension = IPSET_DIM_ONE,
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
.create = hash_mac_create,
/* create_policy: expected netlink type for each IPSET_ATTR_* listed */
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
[IPSET_ATTR_PROBES] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
/*
 * adt_policy: expected netlink type (and, where given, a length bound)
 * for each attribute of an add/delete/test request.
 */
.adt_policy = {
/* The MAC address itself: binary attribute bounded by ETH_ALEN */
[IPSET_ATTR_ETHER] = { .type = NLA_BINARY,
.len = ETH_ALEN },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
/* NUL-terminated string bounded by IPSET_MAX_COMMENT_SIZE */
[IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING,
.len = IPSET_MAX_COMMENT_SIZE },
[IPSET_ATTR_SKBMARK] = { .type = NLA_U64 },
[IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 },
[IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 },
},
.me = THIS_MODULE,
};
3.
1. What is the purpose of HTYPE? It does not appear to be used anywhere else in the file.
/* Type specific function prefix */
#define HTYPE hash_mac
2. What is the purpose of MTYPE.
#define MTYPE hash_mac4
Most of the functions are generated: the macros are consumed by the
included ip_set_hash_gen.h file. HTYPE is used for the IPv4/IPv6-independent
functions, while MTYPE is required for the IPv4- and IPv6-specific
ones.
3. Why are two different functions used, KADT and UADT? I understand that
ADT stands for add, delete, test, and that K and U probably mean kernel
space and user space. But how do the operations of KADT and UADT differ?
Those are the common add, del and test routines for the kernel path and the
userspace path, respectively. The functions are separate because the input
arrives in different forms: in the userspace case as netlink attributes,
while in the kernel case through the API.
4. Who calls kadt in kernel space? How is uadt called from user space?
The kadt functions are called from the kernel-side interfaces (ip_set_add,
etc.), while the uadt ones are called from ip_set_uadd, etc. (see
ip_set_core.c). Please note that the uadt functions are NOT called in
userspace: they work on data sent FROM userspace.
4.
Userspace command: ipset
Function ipset_cmd(session, cmd, restore_line);
/**
* ipset_cmd - execute a command
* @session: session structure
* @cmd: command to execute
* @lineno: command line number in restore mode
*
* Execute - or prepare/buffer in restore mode - a command.
* It is the caller responsibility that the data field be filled out
* with all required parameters for a successful execution.
* The data field is cleared after this function call for the public
* commands.
*
* Returns 0 on success or a negative error code.
*/
int ipset_cmd(struct ipset_session *session, enum ipset_cmd cmd, uint32_t lineno)
{
struct ipset_data *data;
bool aggregate = false;
int ret = -1;
assert(session);
if (cmd <= IPSET_CMD_NONE || cmd >= IPSET_MSG_MAX)
return 0;
/* Initialize transport method if not done yet */
if (session->handle == NULL && init_transport(session) == NULL)
return ipset_err(session,
"Cannot open session to kernel.");
data = session->data;
/* Check protocol version once */
if (!session->version_checked) {
if (build_send_private_msg(session, IPSET_CMD_PROTOCOL) < 0)
return -1;
}
/* Private commands */
if (cmd == IPSET_CMD_TYPE || cmd == IPSET_CMD_HEADER)
return build_send_private_msg(session, cmd);
/* Check aggregatable commands */
aggregate = may_aggregate_ad(session, cmd);
if (!aggregate) {
/* Flush possible aggregated commands */
ret = ipset_commit(session);
if (ret < 0)
return ret;
}
/* Real command: update lineno too */
session->cmd = cmd;
session->lineno = lineno;
/* Set default output mode */
if (cmd == IPSET_CMD_LIST) {
if (session->mode == IPSET_LIST_NONE)
session->mode = IPSET_LIST_PLAIN;
} else if (cmd == IPSET_CMD_SAVE) {
if (session->mode == IPSET_LIST_NONE)
session->mode = IPSET_LIST_SAVE;
}
/* Start the root element in XML mode */
if ((cmd == IPSET_CMD_LIST || cmd == IPSET_CMD_SAVE) &&
session->mode == IPSET_LIST_XML)
safe_snprintf(session, "
D("next: build_msg");
/* Build new message or append buffered commands */
ret = build_msg(session, aggregate);
D("build_msg returned %u", ret);
if (ret > 0) {
/* Buffer is full, send buffered commands */
ret = ipset_commit(session);
if (ret < 0)
goto cleanup;
ret = build_msg(session, false);
D("build_msg 2 returned %u", ret);
}
if (ret < 0)
goto cleanup;
D("past: build_msg");
/* We have to save the type for error handling */
session->saved_type = ipset_data_get(data, IPSET_OPT_TYPE);
if (session->lineno != 0 &&
(cmd == IPSET_CMD_ADD || cmd == IPSET_CMD_DEL)) {
/* Save setname for the next possible aggregated restore line */
strcpy(session->saved_setname, ipset_data_setname(data));
ipset_data_reset(data);
/* Don't commit: we may aggregate next command */
ret = 0;
goto cleanup;
}
D("call commit");
ret = ipset_commit(session);
cleanup:
D("reset data");
ipset_data_reset(data);
return ret;
}
#define NFNL_SUBSYS_IPSET 6

/*
 * nfnetlink subsystem descriptor for ipset: subsystem id
 * NFNL_SUBSYS_IPSET, with IPSET_MSG_MAX callback slots taken from
 * ip_set_netlink_subsys_cb. ip_set_init() passes it to
 * nfnetlink_subsys_register().
 * (Fields were at lines 1875-1879 of the quoted kernel listing.)
 */
static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = {
	.name		= "ip_set",
	.subsys_id	= NFNL_SUBSYS_IPSET,
	.cb_count	= IPSET_MSG_MAX,
	.cb		= ip_set_netlink_subsys_cb,
};
2000 static struct nf_sockopt_ops so_set __read_mostly = { 2001 .pf = PF_INET, 2002 .get_optmin = SO_IP_SET, 2003 .get_optmax = SO_IP_SET + 1, 2004 .get = &ip_set_sockfn_get, 2005 .owner = THIS_MODULE, 2006 };
static int __init 2055 ip_set_init(void) 2056 { 2057 int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
2063 ret = nf_register_sockopt(&so_set); 2064 if (ret != 0) { 2065 pr_err("SO_SET registry failed: %d\n", ret); 2066 nfnetlink_subsys_unregister(&ip_set_netlink_subsys); 2067 return ret; 2068 }
}
1802 static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { 1803 [IPSET_CMD_NONE] = { 1804 .call = ip_set_none, 1805 .attr_count = IPSET_ATTR_CMD_MAX, 1806 }, 1807 [IPSET_CMD_CREATE] = { 1808 .call = ip_set_create, 1809 .attr_count = IPSET_ATTR_CMD_MAX, 1810 .policy = ip_set_create_policy, 1811 }, 1812 [IPSET_CMD_DESTROY] = { 1813 .call = ip_set_destroy, 1814 .attr_count = IPSET_ATTR_CMD_MAX, 1815 .policy = ip_set_setname_policy, 1816 }, 1817 [IPSET_CMD_FLUSH] = { 1818 .call = ip_set_flush, 1819 .attr_count = IPSET_ATTR_CMD_MAX, 1820 .policy = ip_set_setname_policy, 1821 }, 1822 [IPSET_CMD_RENAME] = { 1823 .call = ip_set_rename, 1824 .attr_count = IPSET_ATTR_CMD_MAX, 1825 .policy = ip_set_setname2_policy, 1826 }, 1827 [IPSET_CMD_SWAP] = { 1828 .call = ip_set_swap, 1829 .attr_count = IPSET_ATTR_CMD_MAX, 1830 .policy = ip_set_setname2_policy, 1831 }, 1832 [IPSET_CMD_LIST] = { 1833 .call = ip_set_dump, 1834 .attr_count = IPSET_ATTR_CMD_MAX, 1835 .policy = ip_set_setname_policy, 1836 }, 1837 [IPSET_CMD_SAVE] = { 1838 .call = ip_set_dump, 1839 .attr_count = IPSET_ATTR_CMD_MAX, 1840 .policy = ip_set_setname_policy, 1841 }, 1842 [IPSET_CMD_ADD] = { 1843 .call = ip_set_uadd, 1844 .attr_count = IPSET_ATTR_CMD_MAX, 1845 .policy = ip_set_adt_policy, 1846 }, 1847 [IPSET_CMD_DEL] = { 1848 .call = ip_set_udel, 1849 .attr_count = IPSET_ATTR_CMD_MAX, 1850 .policy = ip_set_adt_policy, 1851 }, 1852 [IPSET_CMD_TEST] = { 1853 .call = ip_set_utest, 1854 .attr_count = IPSET_ATTR_CMD_MAX, 1855 .policy = ip_set_adt_policy, 1856 }, 1857 [IPSET_CMD_HEADER] = { 1858 .call = ip_set_header, 1859 .attr_count = IPSET_ATTR_CMD_MAX, 1860 .policy = ip_set_setname_policy, 1861 }, 1862 [IPSET_CMD_TYPE] = { 1863 .call = ip_set_type, 1864 .attr_count = IPSET_ATTR_CMD_MAX, 1865 .policy = ip_set_type_policy, 1866 }, 1867 [IPSET_CMD_PROTOCOL] = { 1868 .call = ip_set_protocol, 1869 .attr_count = IPSET_ATTR_CMD_MAX, 1870 .policy = ip_set_protocol_policy, 1871 }, 1872 };
1881 /* Interface to iptables/ip6tables */ 1882 1883 static int 1884 ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) 1885 { 1886 unsigned int *op; 1887 void *data; 1888 int copylen = *len, ret = 0; 1889 struct net *net = sock_net(sk); 1890 struct ip_set_net *inst = ip_set_pernet(net); 1891 1892 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 1893 return -EPERM; 1894 if (optval != SO_IP_SET) 1895 return -EBADF; 1896 if (*len < sizeof(unsigned int)) 1897 return -EINVAL; 1898 1899 data = vmalloc(*len); 1900 if (!data) 1901 return -ENOMEM; 1902 if (copy_from_user(data, user, *len) != 0) { 1903 ret = -EFAULT; 1904 goto done; 1905 } 1906 op = (unsigned int *)data; 1907 1908 if (*op < IP_SET_OP_VERSION) { 1909 /* Check the version at the beginning of operations */ 1910 struct ip_set_req_version *req_version = data; 1911 1912 if (*len < sizeof(struct ip_set_req_version)) { 1913 ret = -EINVAL; 1914 goto done; 1915 } 1916 1917 if (req_version->version != IPSET_PROTOCOL) { 1918 ret = -EPROTO; 1919 goto done; 1920 } 1921 } 1922 1923 switch (*op) { 1924 case IP_SET_OP_VERSION: { 1925 struct ip_set_req_version *req_version = data; 1926 1927 if (*len != sizeof(struct ip_set_req_version)) { 1928 ret = -EINVAL; 1929 goto done; 1930 } 1931 1932 req_version->version = IPSET_PROTOCOL; 1933 ret = copy_to_user(user, req_version, 1934 sizeof(struct ip_set_req_version)); 1935 goto done; 1936 } 1937 case IP_SET_OP_GET_BYNAME: { 1938 struct ip_set_req_get_set *req_get = data; 1939 ip_set_id_t id; 1940 1941 if (*len != sizeof(struct ip_set_req_get_set)) { 1942 ret = -EINVAL; 1943 goto done; 1944 } 1945 req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; 1946 nfnl_lock(NFNL_SUBSYS_IPSET); 1947 find_set_and_id(inst, req_get->set.name, &id); 1948 req_get->set.index = id; 1949 nfnl_unlock(NFNL_SUBSYS_IPSET); 1950 goto copy; 1951 } 1952 case IP_SET_OP_GET_FNAME: { 1953 struct ip_set_req_get_set_family *req_get = data; 1954 ip_set_id_t id; 1955 1956 if (*len != sizeof(struct 
ip_set_req_get_set_family)) { 1957 ret = -EINVAL; 1958 goto done; 1959 } 1960 req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; 1961 nfnl_lock(NFNL_SUBSYS_IPSET); 1962 find_set_and_id(inst, req_get->set.name, &id); 1963 req_get->set.index = id; 1964 if (id != IPSET_INVALID_ID) 1965 req_get->family = ip_set(inst, id)->family; 1966 nfnl_unlock(NFNL_SUBSYS_IPSET); 1967 goto copy; 1968 } 1969 case IP_SET_OP_GET_BYINDEX: { 1970 struct ip_set_req_get_set *req_get = data; 1971 struct ip_set *set; 1972 1973 if (*len != sizeof(struct ip_set_req_get_set) || 1974 req_get->set.index >= inst->ip_set_max) { 1975 ret = -EINVAL; 1976 goto done; 1977 } 1978 nfnl_lock(NFNL_SUBSYS_IPSET); 1979 set = ip_set(inst, req_get->set.index); 1980 strncpy(req_get->set.name, set ? set->name : "", 1981 IPSET_MAXNAMELEN); 1982 nfnl_unlock(NFNL_SUBSYS_IPSET); 1983 goto copy; 1984 } 1985 default: 1986 ret = -EBADMSG; 1987 goto done; 1988 } /* end of switch(op) */ 1989 1990 copy: 1991 ret = copy_to_user(user, data, copylen); 1992 1993 done: 1994 vfree(data); 1995 if (ret > 0) 1996 ret = 0; 1997 return ret; 1998 }
What happens when we add an iptables rule that uses ipset, i.e. a rule with the set match or the SET target?
From iptables, ipset is used via the set match and SET target. The corresponding kernel module (net/netfilter/xt_set.c) calls the ipset kernel API functions: ip_set_test(), ip_set_add(), ip_set_del().