poll和select算是姐妹篇吧。poll没有select那么多限制,但就代码来看,poll每次在用户态和内核态之间传递的数据比select更多,事件机制也没有显著变化。我猜poll是后来epoll机制的先导。
系统调用接口
/*
 * poll(2) system call entry point.
 *
 * @ufds:          user-space array of struct pollfd to examine
 * @nfds:          number of entries in @ufds
 * @timeout_msecs: timeout in milliseconds; when negative no deadline is
 *                 set up and a NULL deadline is handed to do_sys_poll()
 *
 * Returns the result of do_sys_poll(). If that result is -EINTR, the
 * current task's restart block is filled in so the call can be resumed
 * through do_restart_poll(), and -ERESTART_RESTARTBLOCK is returned
 * instead.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec64 end_time, *to = NULL;
	int ret;

	if (timeout_msecs >= 0) {
		to = &end_time;
		/* Split the millisecond timeout into seconds + nanoseconds. */
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	/* The real work happens in do_sys_poll(). */
	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		struct restart_block *restart_block;

		/* Record the arguments so the interrupted poll can be restarted. */
		restart_block = &current->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			/* Keep the already computed deadline for the restart. */
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else {
			restart_block->poll.has_timeout = 0;
		}

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}
do_sys_poll函数,这个函数也是预处理函数,将用户态的数据拷贝到内核,然后调用do_poll函数工作,等do_poll返回以后再将处理完的数据拷贝回用户态。比较有意思的是,它采用的是分块处理,每个walk块占一个page;当最后一块不足一个page时,只按该块实际需要的长度分配内存。代码同样非常简单,稍微看看就懂,不列出了。
do_poll函数是真正干活的函数。由于是select的姐妹篇,它在超时设置、句柄查询方式等方面跟select没有区别。不废话,直接上代码。
/*
 * Core poll loop: repeatedly scan every pollfd in the chain headed by
 * @list, sleeping between scans, until something ends the wait.
 *
 * @list:     chain of pollfd blocks (linked through ->next) to scan
 * @wait:     poll wait-queue state; its embedded poll_table is handed to
 *            do_pollfd() and its ->error field is checked after each scan
 * @end_time: deadline, or NULL for no deadline; an all-zero value means
 *            "do not wait at all"
 *
 * Returns the number of pollfds for which do_pollfd() reported a non-zero
 * mask. If none did, returns wait->error, or -EINTR when a signal is
 * pending; this is 0 when the wait simply timed out with no error.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; /* whether busy polling is enabled */
unsigned long busy_start = 0;
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt->_qproc = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time); /* slack later passed to poll_schedule_timeout() */
for (;;) { /* outer loop: left on an event, an error, a pending signal, or timeout */
struct poll_list *walk;
bool can_busy_loop = false;
for (walk = list; walk != NULL; walk = walk->next) { /* second-level loop: one block of pollfds at a time */
struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) { /* innermost loop: one pollfd at a time */
/*
 * Fish for events. If we found one, record it
 * and kill poll_table->_qproc, so we don't
 * needlessly register any other waiters after
 * this. They'll get immediately deregistered
 * when we break out and return.
 */
if (do_pollfd(pfd, pt, &can_busy_loop, /* query this descriptor for events */
busy_flag)) {
count++;
pt->_qproc = NULL;
/* found something, stop busy polling */
busy_flag = 0;
can_busy_loop = false;
}
}
}
/*
 * All waiters have already been registered, so don't provide
 * a poll_table->_qproc to them on the next loop iteration.
 */
pt->_qproc = NULL;
/* Nothing ready: report a recorded error, or -EINTR if a signal is pending. */
if (!count) {
count = wait->error;
if (signal_pending(current))
count = -EINTR;
}
if (count || timed_out)
break;
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) { /* author's note: same as the select path */
if (!busy_start) {
busy_start = busy_loop_current_time();
continue;
}
if (!busy_loop_timeout(busy_start))
continue;
}
/* Busy polling is over for this call; fall back to sleeping. */
busy_flag = 0;
/*
 * If this is the first loop and we have a timeout
 * given, then we convert to ktime_t and set the to
 * pointer to the expiry value.
 */
if (end_time && !to) {
expire = timespec64_to_ktime(*end_time);
to = &expire;
}
/* A zero return marks the timeout; one more scan runs before leaving the loop. */
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) /* author's note: same as the select path */
timed_out = 1;
}
return count;
}
do_pollfd函数。查询单个文件句柄是否有事件,上代码。
/*
 * Query a single pollfd for pending events.
 *
 * @pollfd:        entry to examine; its ->revents field is always written
 * @pwait:         poll table handed to the file's ->poll() method
 * @can_busy_poll: set to true when the returned mask contains @busy_flag
 * @busy_flag:     extra bit OR-ed into the poll key (0 when not busy polling)
 *
 * The resulting mask is 0 for a negative fd, POLLNVAL when the fd does not
 * resolve to a file, and otherwise the file's reported events (or
 * DEFAULT_POLLMASK when the file has no ->poll() method) restricted to the
 * requested events plus POLLERR and POLLHUP.
 *
 * Returns the same mask that is stored in pollfd->revents.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
				     bool *can_busy_poll,
				     unsigned int busy_flag)
{
	const int fd = pollfd->fd;
	unsigned int revents = 0;
	struct fd f;

	/* Negative descriptors are ignored and report no events. */
	if (fd < 0)
		goto out;

	f = fdget(fd);
	if (!f.file) {
		/* The descriptor does not refer to an open file. */
		revents = POLLNVAL;
		goto out;
	}

	if (!f.file->f_op->poll) {
		revents = DEFAULT_POLLMASK;
	} else {
		/*
		 * Kept as two separate statements on purpose: the first
		 * expression is evaluated as int, the second ORs in an
		 * unsigned int.
		 */
		pwait->_key = pollfd->events | POLLERR | POLLHUP;
		pwait->_key |= busy_flag;
		/* Ask the file itself which events are pending. */
		revents = f.file->f_op->poll(f.file, pwait);
		if (revents & busy_flag)
			*can_busy_poll = true;
	}

	/* Drop events the caller did not ask for. */
	revents &= pollfd->events | POLLERR | POLLHUP;
	fdput(f);

out:
	pollfd->revents = revents;
	return revents;
}
poll函数比select函数稍有改进的地方是:poll函数没有文件句柄数量的限制,也不像select那样奇葩地只能监听句柄号在1024以下的文件。但是poll并不完美,每次调用poll函数都需要将所有的文件句柄下传到内核态。即使这次监听只有一个文件有事件,下次监听时也需要将所有文件句柄都下传;而且不只是下传,当有结果的时候内核还要上传,即使只有一个文件有事件,也需要把所有句柄的状态都上传。当句柄足够多的时候,poll函数本身非常消耗cpu:它在内核态需要挨个轮询,返回用户态以后还需要再挨个轮询,两遍轮询存在性能瓶颈。时效性也有问题,如果出现事件的句柄排在下传句柄的最后,则有可能消息处理不及时。鉴于poll函数有如此多的缺陷,开源社区的老大们开发了epoll系统调用。epoll几乎完美地解决了上述问题,因此在高负载情况下,大多数都是用epoll系统调用。