分类
未分类

linux 内核poll机制分析

poll和select算是姐妹篇吧。poll没有select那么多限制。但是从代码上看,poll每次在用户态和内核态之间传递的数据比select更多,事件机制也没有显著变化。我猜poll是后来epoll机制的先导。
系统调用接口

/*
 * poll(2) system call entry point.
 *
 * @ufds:          user-space pointer to the struct pollfd array
 * @nfds:          value forwarded to do_sys_poll() as the entry count
 * @timeout_msecs: timeout in milliseconds; when negative no timeout is
 *                 set up and a NULL timeout pointer is passed down
 *
 * Returns the result of do_sys_poll(), except that -EINTR is turned into
 * -ERESTART_RESTARTBLOCK once the task's restart block has been filled in
 * with do_restart_poll as the restart function.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                int, timeout_msecs)
{
        struct timespec64 end_time, *to = NULL;
        struct restart_block *restart_block;
        int has_timeout = (timeout_msecs >= 0);
        int ret;

        if (has_timeout) {
                /* Split the millisecond count into seconds + nanoseconds. */
                to = &end_time;
                poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
                        NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
        }

        /* do_sys_poll() does the actual work. */
        ret = do_sys_poll(ufds, nfds, to);
        if (ret != -EINTR)
                return ret;

        /*
         * Interrupted: save the original arguments (and the end time, if
         * one was set) in the restart block before asking for a restart.
         */
        restart_block = &current->restart_block;
        restart_block->fn = do_restart_poll;
        restart_block->poll.ufds = ufds;
        restart_block->poll.nfds = nfds;
        restart_block->poll.has_timeout = has_timeout;
        if (has_timeout) {
                restart_block->poll.tv_sec = end_time.tv_sec;
                restart_block->poll.tv_nsec = end_time.tv_nsec;
        }

        return -ERESTART_RESTARTBLOCK;
}

do_sys_poll函数:这个函数也是预处理函数,先将用户态的数据拷贝到内核,然后调用do_poll函数工作,等do_poll返回以后再将处理完的数据拷贝回用户态。比较有意思的是,它采用分块处理,每个walk块最多占一个page;当最后一块不足一个page时,只分配该块实际需要的内存。代码同样非常简单,稍微看看就懂,这里不列出了。

do_poll函数:这是真正干活的函数。由于是select的姐妹篇,它在超时设置、句柄查询方式方面跟select没有区别。不废话,直接上代码。

/*
 * Core poll loop: repeatedly scan every pollfd reachable from @list and
 * sleep between scans until there is something to report.
 *
 * @list:     chain of pollfd blocks; each block exposes ->entries, ->len
 *            and ->next
 * @wait:     supplies the poll_table (wait->pt) handed to do_pollfd() and
 *            the wait->error value reported when no events were found
 * @end_time: expiry time, or NULL for no timeout; if both tv_sec and
 *            tv_nsec are zero a single scan is done with _qproc cleared
 *            and the function never sleeps
 *
 * Returns the number of pollfds for which do_pollfd() returned non-zero.
 * If there were none, returns wait->error, or -EINTR when a signal is
 * pending; this is 0 when the wait simply timed out without an error.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                   struct timespec64 *end_time)
{
        poll_table* pt = &wait->pt;
        ktime_t expire, *to = NULL;
        int timed_out = 0, count = 0;
        u64 slack = 0;
        unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;   /* request busy polling only if it is enabled */
        unsigned long busy_start = 0;

        /* Optimise the no-wait case */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                pt->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);     /* timeout slack, later passed to poll_schedule_timeout() */

        for (;;) {      /* outer loop; left on timeout, when an fd has events, or on error/pending signal (same as select) */
                struct poll_list *walk;
                bool can_busy_loop = false;

                for (walk = list; walk != NULL; walk = walk->next) {    /* second-level loop: go through the list block by block */
                        struct pollfd * pfd, * pfd_end;

                        pfd = walk->entries;
                        pfd_end = pfd + walk->len;
                        for (; pfd != pfd_end; pfd++) {     /* handle one pollfd entry at a time */
                                /*
                                 * Fish for events. If we found one, record it
                                 * and kill poll_table->_qproc, so we don't
                                 * needlessly register any other waiters after
                                 * this. They'll get immediately deregistered
                                 * when we break out and return.
                                 */
                                if (do_pollfd(pfd, pt, &can_busy_loop,              /* queries whether this fd has events */
                                              busy_flag)) {
                                        count++;
                                        pt->_qproc = NULL;
                                        /* found something, stop busy polling */
                                        busy_flag = 0;
                                        can_busy_loop = false;
                                }
                        }
                }
                /*
                 * All waiters have already been registered, so don't provide
                 * a poll_table->_qproc to them on the next loop iteration.
                 */
                pt->_qproc = NULL;
                if (!count) {
                        /* Nothing ready: report a recorded error, or -EINTR if a signal is pending. */
                        count = wait->error;
                        if (signal_pending(current))
                                count = -EINTR;
                }
                if (count || timed_out)
                        break;

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) { /* same busy-loop handling as select */
                        if (!busy_start) {
                                /* First busy pass: note the start time and rescan at once. */
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        /* Keep rescanning until the busy-loop budget has run out. */
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                /* Busy polling is over (or was never possible); stop requesting it. */
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                /* Sleep; a zero return is treated as the timeout having expired (same as select). */
                if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
                        timed_out = 1;
        }
        return count;
}

do_pollfd函数。查询单个文件句柄是否有事件,上代码。

/*
 * Query a single pollfd entry for events and store the outcome in
 * pollfd->revents.
 *
 * @pollfd:        entry to examine; ->fd and ->events are read, ->revents
 *                 is written
 * @pwait:         poll table passed to the file's ->poll method; its _key
 *                 is set to the events of interest first
 * @can_busy_poll: set to true when the returned mask contains @busy_flag
 * @busy_flag:     extra bit(s) OR-ed into the key given to ->poll
 *
 * Returns the value stored in ->revents: 0 for a negative fd, POLLNVAL
 * when fdget() yields no file, otherwise the file's event mask (or
 * DEFAULT_POLLMASK if it has no ->poll method) limited to the requested
 * events plus POLLERR and POLLHUP.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                     bool *can_busy_poll,
                                     unsigned int busy_flag)
{
        unsigned int revents = 0;
        int wanted;
        struct fd f;

        /* Negative descriptors are skipped and report no events. */
        if (pollfd->fd < 0)
                goto out;

        /* Look up the file behind the descriptor. */
        f = fdget(pollfd->fd);
        if (!f.file) {
                revents = POLLNVAL;
                goto out;
        }

        /* Error and hang-up conditions are always reported. */
        wanted = pollfd->events | POLLERR | POLLHUP;

        revents = DEFAULT_POLLMASK;
        if (f.file->f_op->poll) {
                pwait->_key = wanted;
                pwait->_key |= busy_flag;
                /* Ask the file's own ->poll method for its current state. */
                revents = f.file->f_op->poll(f.file, pwait);
                if (revents & busy_flag)
                        *can_busy_poll = true;
        }

        /* Drop events the caller did not ask for. */
        revents &= wanted;
        fdput(f);

out:
        /* Record the result in the entry and hand it back to the caller. */
        pollfd->revents = revents;
        return revents;
}

与select相比,poll的改进在于:它没有固定的文件句柄数量上限(只受进程RLIMIT_NOFILE的约束),也不像select那样只能监听句柄号小于1024(FD_SETSIZE)的文件。但是poll并不完美:每次调用poll都需要将所有的文件句柄下传到内核态,即使这次监听只有一个文件有事件,下次监听时也需要把所有文件句柄重新下传;而且不只是下传,有结果时内核还要把所有句柄的状态都上传回用户态,即使只有一个文件有事件也是如此。当句柄足够多的时候,poll函数本身就非常消耗CPU:它在内核态需要挨个轮询,返回用户态以后应用程序还需要再挨个轮询一遍,两遍轮询构成性能瓶颈。时效性也有问题:如果出现事件的句柄排在下传句柄的最后,消息就有可能处理不及时。鉴于poll函数有这么多缺陷,开源社区的老大们开发了epoll系统调用。epoll几乎完美地解决了上述问题,因此在高负载情况下,大多数程序都使用epoll系统调用。

发表评论

电子邮件地址不会被公开。 必填项已用*标注