Linux # cat /proc/cpuinfo | grep "physical id"
physical id : 0
physical id : 0
physical id : 0
physical id : 0
physical id : 1
physical id : 1
physical id : 1
physical id : 1
============================================================ Core and Socket Information (as reported by '/proc/cpuinfo') ============================================================
随着计算机技术(特别是以芯片为主的硬件技术)的快速发展,CPU 架构逐步从以前的单核时代进阶到如今的多核时代,在多核时代里,多颗 CPU 核心构成了一个小型的“微观世界”。每颗 CPU 各司其职,并与其他 CPU 交互,共同支撑起了一个“物理世界”。从这个意义上来看,我们更愿意称这样的“微观世界”为 CPU 拓扑,就像一个物理网络一样,各个网络节点通过拓扑关系建立起连接来支撑起整个通信系统。
单核 or 多核 or 多 CPU or 超线程 ?
在单核时代,为了提升 CPU 的处理能力,普遍的做法是提高 CPU 的主频率,但一味地提高频率对于 CPU 的功耗也是影响很大的(CPU 的功耗正比于主频的三次方)。
如果说前面咱们讨论的是 CPU 内部的“微观世界”,那么本节将跳出来,探讨一个系统级的“宏观世界”。
首先是 SMP,对称多处理器系统,指的是一种多个 CPU 处理器共享资源的电脑硬件架构,其中,每个 CPU 没有主从之分,地位平等,它们共享相同的物理资源,包括总线、内存、IO、操作系统等。每个 CPU 访问内存所用时间都是相同的,因此这种系统也被称为一致存储访问结构(UMA,Uniform Memory Access)。
ls /sys/devices/system/node/ # 如果只看到一个 node0 那就是 SMP 架构
为了应对大规模的系统要求(特别是云计算环境),就研制出了 NUMA(Non-Uniform Memory Access)结构,即非一致存储访问结构。
这种结构引入了 CPU 分组的概念,用 Node 来表示,一个 Node 可能包含多个物理 CPU 封装,从而包含多个 CPU 物理核心。每个 Node 有自己独立的资源,包括内存、IO 等。每个 Node 之间可以通过互联模块总线(QPI)进行通信,所以,也就意味着每个 Node 上的 CPU 都可以访问到整个系统中的所有内存,但很显然,访问远端 Node 的内存比访问本地内存要耗时很多,这也是 NUMA 架构的问题所在,我们在基于 NUMA 架构开发上层应用程序要尽可能避免跨 Node 内存访问。
NUMA 架构在 SMP 架构的基础上通过分组的方式增强了可扩展性,但从性能上看,随着 CPU 数量的增加,并不能线性增加系统性能,原因就在于跨 Node 内存访问的问题。所以,一般 NUMA 架构最多支持几百个 CPU 就不错了。
但对于很多大型计算密集型的系统来说,NUMA 显然有些吃力,所以,后来又出现了 MPP(Massively Parallel Processing)架构,即海量并行处理架构。这种架构也有分组的概念,但和 NUMA 不同的是,它不存在异地内存访问的问题,每个分组内的 CPU 都有自己本地的内存、IO,并且不与其他 CPU 共享,是一种完全无共享的架构,因此它的扩展性最好,可以支持多达数千个 CPU 的量级。
%Cpu(s):表示当前 CPU 的使用情况,如果要查看所有核(逻辑核)的使用情况,可以按下数字 “1” 查看。这里有几个参数,表示如下:
1 2 3 4 5 6 7 8
- us 用户空间占用 CPU 时间比例
- sy 系统(内核)占用 CPU 时间比例
- ni 用户空间改变过优先级的进程占用 CPU 时间比例
- id CPU 空闲时间比例
- wa IO 等待时间比例(IO 等待高时,可能是磁盘性能有问题了)
- hi 硬件中断占用 CPU 时间比例
- si 软件中断占用 CPU 时间比例
- st 虚拟机被宿主机“偷取”的 CPU 时间比例(steal time)
每个进程的使用情况:这里可以罗列每个进程的使用情况,包括内存和 CPU 的,如果要看某个具体的进程,可以使用 top -p pid 查看。
和 top 一样的还有一个改进版的工具:htop,功能和 top 一样的,只不过比 top 表现更炫酷,使用更方便,可以看下它的效果。
/*
 * A structure to contain pointers to all per-process
 * namespaces - fs (mount), uts, network, sysvipc, etc.
 *
 * 'count' is the number of tasks holding a reference.
 * The count for each namespace, then, will be the number
 * of nsproxies pointing to it, not the number of tasks.
 *
 * The nsproxy is shared by tasks which share all namespaces.
 * As soon as a single namespace is cloned or unshared, the
 * nsproxy is copied.
 */
struct nsproxy {
	atomic_t count;                  /* number of nsproxy references (see comment above) */
	struct uts_namespace *uts_ns;    /* hostname / domainname (UTS) namespace */
	struct ipc_namespace *ipc_ns;    /* System V IPC / POSIX mqueue namespace */
	struct mnt_namespace *mnt_ns;    /* mount-point namespace */
	struct pid_namespace *pid_ns;    /* pid numbering namespace */
	struct net *net_ns;              /* network stack namespace */
};
/* The nsproxy of the initial task (all initial namespaces). */
extern struct nsproxy init_nsproxy;
/*
 * INIT_TASK is used to set up the first task table, touch at
 * your own risk!. Base=0, limit=0x1fffff (=2MB)
 */
/* NOTE(review): excerpt — "......" elides the other initializers; only the
 * nsproxy field is shown here: the first task starts with init_nsproxy. */
#define INIT_TASK(tsk) \
{ ...... .nsproxy = &init_nsproxy, ...... }
/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * NOTE(review): excerpt — the function body is truncated below
 * (wake-up of the child, vfork completion, return of nr are not shown).
 */
long do_fork(unsigned long clone_flags,
	     unsigned long stack_start,
	     unsigned long stack_size,
	     int __user *parent_tidptr,
	     int __user *child_tidptr)
{
	/* Pointer to the new process descriptor. */
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Determine whether and which event to report to ptracer. When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	/* Duplicate the current process descriptor; returns the child's task_struct
	 * (or an ERR_PTR on failure). */
	p = copy_process(clone_flags, stack_start, stack_size,
			 child_tidptr, NULL, trace);
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		struct completion vfork;
		struct pid *pid;

		trace_sched_process_fork(current, p);

		/* Obtain the new process descriptor's pid; pid_vnr() translates it
		 * to the numeric value seen in the caller's pid namespace. */
		pid = get_task_pid(p, PIDTYPE_PID);
		nr = pid_vnr(pid);

		/* If requested, store the child's pid at the parent-supplied address. */
		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);
/*
 * Validate the clone flags, duplicate the current task_struct and
 * (in the elided remainder) copy/share each per-process resource.
 *
 * NOTE(review): excerpt — the function body is truncated below this view.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
					unsigned long stack_start,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid,
					int trace)
{
	int retval;
	/* Pointer to the new process descriptor. */
	struct task_struct *p;

	/* A new user namespace cannot share fs_struct with the parent. */
	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	/*
	 * Siblings of global init remain as zombies on exit since they are
	 * not reaped by their parent (swapper). To solve this and to avoid
	 * multi-rooted process trees, prevent global and container-inits
	 * from creating siblings.
	 */
	/* e.g. with CLONE_PARENT, check whether the current signal flags contain
	 * SIGNAL_UNKILLABLE, preventing an init process from creating siblings. */
	if ((clone_flags & CLONE_PARENT) &&
	    current->signal->flags & SIGNAL_UNKILLABLE)
		return ERR_PTR(-EINVAL);

	/*
	 * If the new process will be in a different pid or user namespace
	 * do not allow it to share a thread group or signal handlers or
	 * parent with the forking task.
	 */
	if (clone_flags & CLONE_SIGHAND) {
		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
		    (task_active_pid_ns(current) !=
		     current->nsproxy->pid_ns_for_children))
			return ERR_PTR(-EINVAL);
	}

	/* Give the security module a chance to veto the fork. */
	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	/* Duplicate the current task_struct (and its stack). */
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;

	/* Check whether the user exceeds the per-user process limit
	 * (RLIMIT_NPROC, configured by the OS); root-ish users are exempt. */
	if (atomic_read(&p->real_cred->user->processes) >=
	    task_rlimit(p, RLIMIT_NPROC)) {
		if (p->real_cred->user != INIT_USER &&
		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
			goto bad_fork_free;
	}
	current->flags &= ~PF_NPROC_EXCEEDED;

	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	retval = -EAGAIN;
	/* Check the system-wide thread count against max_threads,
	 * which is sized from the amount of memory. */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;
/*
 * called from clone. This now handles copy for nsproxy and all
 * namespaces therein.
 *
 * NOTE(review): excerpt — truncated below; old_ns is captured here but
 * its use (and the attach of new_ns to tsk) is not visible in this view.
 */
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
	struct nsproxy *old_ns = tsk->nsproxy;
	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
	struct nsproxy *new_ns;

	/* Creating new namespaces requires CAP_SYS_ADMIN in the user namespace. */
	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * CLONE_NEWIPC must detach from the undolist: after switching
	 * to a new ipc namespace, the semaphore arrays from the old
	 * namespace are unreachable. In clone parlance, CLONE_SYSVSEM
	 * means share undolist with parent, so we must forbid using
	 * it along with CLONE_NEWIPC.
	 */
	if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
	    (CLONE_NEWIPC | CLONE_SYSVSEM))
		return -EINVAL;

	/* Build the new nsproxy with the requested namespaces copied/created. */
	new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
	if (IS_ERR(new_ns))
		return PTR_ERR(new_ns);
/*
 * Create new nsproxy and all of its the associated namespaces.
 * Return the newly created nsproxy. Do not attach this to the task,
 * leave it to the caller to do proper locking and attach it to task.
 *
 * NOTE(review): excerpt — truncated below; the per-namespace copy calls
 * (mnt, uts, ipc, pid, net) are not visible in this view.
 */
static struct nsproxy *create_new_namespaces(unsigned long flags,
	struct task_struct *tsk, struct user_namespace *user_ns,
	struct fs_struct *new_fs)
{
	struct nsproxy *new_nsp;
	int err;

	/* Allocate the new (empty) nsproxy container. */
	new_nsp = create_nsproxy();
	if (!new_nsp)
		return ERR_PTR(-ENOMEM);
/*
 * mnt namespace:
 *
 * Share the caller's mount namespace, or — when CLONE_NEWNS is set —
 * build a new one by copying the parent's mount tree and re-pointing
 * the child's fs_struct (root, cwd) at the copied mounts.
 */
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		struct user_namespace *user_ns, struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct mount *p, *q;
	struct mount *old;
	struct mount *new;
	int copy_flags;

	BUG_ON(!ns);

	/* No CLONE_NEWNS: share the parent's mount namespace, just take a ref. */
	if (likely(!(flags & CLONE_NEWNS))) {
		get_mnt_ns(ns);
		return ns;
	}

	old = ns->root;
	/* Allocate a new, empty mnt namespace. */
	new_ns = alloc_mnt_ns(user_ns);
	if (IS_ERR(new_ns))
		return new_ns;

	namespace_lock();
	/* First pass: copy the tree topology */
	/* Copy the whole mount tree starting from the old namespace's root. */
	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
	if (user_ns != ns->user_ns)
		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
	if (IS_ERR(new)) {
		namespace_unlock();
		free_mnt_ns(new_ns);
		return ERR_CAST(new);
	}
	new_ns->root = new;
	list_add_tail(&new_ns->list, &new->mnt_list);

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace. We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
	/* Walk old and new trees in lockstep, tagging each copied mount with
	 * the new namespace and re-pointing the child's root/cwd mounts. */
	p = old;
	q = new;
	while (p) {
		q->mnt_ns = new_ns;
		if (new_fs) {
			if (&p->mnt == new_fs->root.mnt) {
				new_fs->root.mnt = mntget(&q->mnt);
				rootmnt = &p->mnt;
			}
			if (&p->mnt == new_fs->pwd.mnt) {
				new_fs->pwd.mnt = mntget(&q->mnt);
				pwdmnt = &p->mnt;
			}
		}
		p = next_mnt(p, old);
		q = next_mnt(q, new);
		if (!q)
			break;
		/* Skip old-tree mounts that were not copied (e.g. unbindable). */
		while (p->mnt.mnt_root != q->mnt.mnt_root)
			p = next_mnt(p, old);
	}
	namespace_unlock();

	/* Drop the old references that root/cwd were switched away from. */
	if (rootmnt)
		mntput(rootmnt);
	if (pwdmnt)
		mntput(pwdmnt);

	return new_ns;
}
可以看到,mount namespace 在新建时会新建一个新的 namespace,然后将父进程的 namespace 拷贝过来,并将 mount->mnt_ns 指向新的 namespace。接着设置进程的 root 路径以及当前路径到新的 namespace,然后为新进程设置新的 vfs 等。从这里就可以看出,在子进程中进行 mount 操作不会影响到父进程中的 mount 信息。
uts namespace:
1 2 3 4 5 6 7 8
/*
 * Copy (or reject) the UTS namespace for a new task.
 *
 * NOTE(review): excerpt — truncated below this view. Returning -EINVAL
 * whenever CLONE_NEWUTS is requested suggests this is the !CONFIG_UTS_NS
 * stub variant — confirm against the kernel config this was taken from.
 */
static inline struct uts_namespace *copy_utsname(unsigned long flags,
	struct user_namespace *user_ns, struct uts_namespace *old_ns)
{
	if (flags & CLONE_NEWUTS)
		return ERR_PTR(-EINVAL);