Linux内核部件分析-服务器评测

在linux内核中，有一种通用的双向循环链表，构成了各种队列的基础。链表的结构定义和相关函数均在include/linux/list.h中，下面就来全面地介绍这一链表的各种API。

/*
 * Node of a circular doubly linked list. The same structure is used both
 * for the list head and for the link embedded in each list member.
 */
struct list_head {
	struct list_head *next;	/* following node */
	struct list_head *prev;	/* preceding node */
};

这是链表的元素结构。因为是循环链表，表头和表中节点都是这一结构。有prev和next两个指针，分别指向链表中前一节点和后一节点。

/*
 * LIST_HEAD_INIT - static initializer that sets both members of the list
 * head 'name' to the address of 'name' itself (an empty list).
 */
#define LIST_HEAD_INIT(name) { &(name), &(name) }
/* LIST_HEAD - define a struct list_head variable 'name' initialized as empty. */
#define LIST_HEAD(name) \
struct list_head name = LIST_HEAD_INIT(name)
/*
 * INIT_LIST_HEAD - run-time initialization of a list head.
 * @list: head to initialize; afterwards both links point back at @list.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
	list->next = list;
	list->prev = list;
}

在初始化的时候，链表头的prev和next都是指向自身的。

/*
 * __list_add - link 'new' in between the nodes 'prev' and 'next'.
 * new:  node being inserted
 * prev: node that ends up before 'new'
 * next: node that ends up after 'new'
 * The callers in this file always pass two neighbouring nodes.
 */
static inline void __list_add(struct list_head *new,
struct list_head *prev,
struct list_head *next)
{
/* Hook 'new' to both neighbours and both neighbours to 'new'. */
next->prev = new;
new->next = next;
new->prev = prev;
prev->next = new;
}
/*
 * list_add - insert @new immediately after @head.
 * @new:  node to insert
 * @head: node (usually the list head) to insert after
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
	struct list_head *first = head->next;

	__list_add(new, head, first);
}
/*
 * list_add_tail - insert @new immediately before @head.
 * @new:  node to insert
 * @head: node (usually the list head) to insert before
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
	struct list_head *last = head->prev;

	__list_add(new, last, head);
}

双向循环链表的实现，很少有例外情况，基本都可以用公共的方式来处理。这里无论是加第一个节点，还是其它的节点，使用的方法都一样。

另外，链表API实现时大致都是分为两层：一层外部的，如list_add、list_add_tail，用来消除一些例外情况，调用内部实现；一层是内部的，函数名前会加双下划线，如__list_add，往往是几个操作公共的部分，或者排除例外后的实现。

/*
 * __list_del - make 'prev' and 'next' point at each other, which drops
 * whatever was linked between them from the chain.
 * prev: node that should precede 'next' afterwards
 * next: node that should follow 'prev' afterwards
 */
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
next->prev = prev;
prev->next = next;
}
/*
 * list_del - unlink @entry from the list it is on.
 * @entry: node to remove
 *
 * After unlinking, the node's own links are overwritten with the
 * LIST_POISON1/LIST_POISON2 marker values.
 */
static inline void list_del(struct list_head *entry)
{
	struct list_head *before = entry->prev;
	struct list_head *after = entry->next;

	__list_del(before, after);
	entry->next = LIST_POISON1;
	entry->prev = LIST_POISON2;
}
/*
 * list_del_init - unlink @entry and re-initialize it as an empty list.
 * @entry: node to remove
 */
static inline void list_del_init(struct list_head *entry)
{
	struct list_head *before = entry->prev;
	struct list_head *after = entry->next;

	__list_del(before, after);
	INIT_LIST_HEAD(entry);
}

list_del是链表中节点的删除。之所以在调用__list_del后又把被删除元素的next、prev指向特殊的LIST_POISON1和LIST_POISON2，是为了便于调试：一旦有代码误用已被删除节点的指针，就能很快暴露出来。

list_del_init则是删除节点后，随即把节点中指针再次初始化，这种删除方式更为实用。

/*
 * list_replace - put 'new' in the list position currently held by 'old'.
 * old: node being replaced (its own links are left untouched)
 * new: node taking its place
 *
 * The statement order matters: old->prev is read only after
 * new->next->prev has been written, so when 'old' is the only node
 * (old->next == old->prev == old) 'new' ends up linked to itself.
 */
static inline void list_replace(struct list_head *old,
struct list_head *new)
{
new->next = old->next;
new->next->prev = new;
new->prev = old->prev;
new->prev->next = new;
}
/*
 * list_replace_init - replace @old by @new, then reset @old to an empty list.
 * @old: node being replaced
 * @new: node taking its place
 */
static inline void list_replace_init(struct list_head *old,
				     struct list_head *new)
{
	list_replace(old, new);
	INIT_LIST_HEAD(old);
}

list_replace是将链表中一个节点old，替换为另一个节点new。从实现来看，即使old所在的链表只有old一个节点，new也可以成功替换，这就是双向循环链表可怕的通用之处。

list_replace_init将被替换的old随即又初始化。

/*
 * list_move - unlink @list from its current list and insert it right
 * after @head.
 * @list: node to move
 * @head: node to insert after
 */
static inline void list_move(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);
	__list_add(list, head, head->next);
}

/*
 * list_move_tail - unlink @list from its current list and insert it right
 * before @head.
 * @list: node to move
 * @head: node to insert before
 */
static inline void list_move_tail(struct list_head *list,
				  struct list_head *head)
{
	__list_del(list->prev, list->next);
	__list_add(list, head->prev, head);
}

list_move的作用是把list节点从原链表中去除，并加入新的链表head中。

list_move_tail只在加入新链表时与list_move有所不同，list_move是加到head之后的链表头部，而list_move_tail是加到head之前的链表尾部。

/*
 * list_is_last - test whether @list is the last node before @head.
 * @list: node to test
 * @head: head of the list
 *
 * Returns non-zero when the node following @list is @head.
 */
static inline int list_is_last(const struct list_head *list,
			       const struct list_head *head)
{
	const struct list_head *successor = list->next;

	return successor == head;
}

list_is_last 判断list是否处于head链表的尾部。

/*
 * list_empty - test whether a list has no members.
 * @head: list head to test
 *
 * Returns non-zero when @head's next link points back at @head.
 */
static inline int list_empty(const struct list_head *head)
{
	const struct list_head *first = head->next;

	return first == head;
}
/*
 * list_empty_careful - stricter emptiness test.
 * @head: list head to test
 *
 * Returns 1 only when both the next and the prev link of @head point
 * back at @head, 0 otherwise.
 */
static inline int list_empty_careful(const struct list_head *head)
{
	struct list_head *next = head->next;

	if (next != head)
		return 0;
	return next == head->prev;
}

list_empty 判断head链表是否为空，为空的意思就是只有一个链表头head。

list_empty_careful 同样是判断head链表是否为空，只是检查更为严格。

/*
 * list_is_singular - test whether a list has exactly one member.
 * @head: list head to test
 *
 * Returns non-zero when the list is not empty and its first and last
 * node are the same node.
 */
static inline int list_is_singular(const struct list_head *head)
{
	if (list_empty(head))
		return 0;
	return head->next == head->prev;
}

list_is_singular 判断head中是否只有一个节点，即除链表头head外只有一个节点。

/*
 * __list_cut_position - move the run head->next .. entry onto 'list'.
 * list:  list head that receives the run; its old links are overwritten
 * head:  list being cut
 * entry: last node of the run to move
 */
static inline void __list_cut_position(struct list_head *list,
struct list_head *head, struct list_head *entry)
{
/* Remember what follows 'entry' before entry->next is overwritten. */
struct list_head *new_first = entry->next;
/* 'list' now heads the run [old head->next .. entry]. */
list->next = head->next;
list->next->prev = list;
list->prev = entry;
entry->next = list;
/* 'head' keeps whatever came after 'entry'. */
head->next = new_first;
new_first->prev = head;
}
/*
 * list_cut_position - cut a list into two.
 * @list:  list head that receives the removed nodes
 * @head:  list to cut
 * @entry: last node to move; passing @head itself moves nothing
 *
 * Does nothing when @head is empty, or when @head has a single member and
 * @entry is neither that member nor @head. When @entry is @head, @list is
 * simply initialized as empty.
 */
static inline void list_cut_position(struct list_head *list,
		struct list_head *head, struct list_head *entry)
{
	if (list_empty(head))
		return;

	if (list_is_singular(head) && head->next != entry && head != entry)
		return;

	if (entry != head)
		__list_cut_position(list, head, entry);
	else
		INIT_LIST_HEAD(list);
}

list_cut_position 用于把head链表分为两个部分。从head->next一直到entry被从head链表中删除，加入新的链表list。新链表list应该是空的，或者原来的节点都可以被忽略掉。可以看到，list_cut_position中排除了一些意外情况，保证调用__list_cut_position时至少有一个元素会被加入新链表。

/*
 * __list_splice - link all members of @list in between @prev and @next.
 * @list: list whose members are inserted (the head itself is not touched)
 * @prev: node that will precede the first inserted member
 * @next: node that will follow the last inserted member
 */
static inline void __list_splice(const struct list_head *list,
				 struct list_head *prev,
				 struct list_head *next)
{
	struct list_head *run_start = list->next;
	struct list_head *run_end = list->prev;

	run_start->prev = prev;
	prev->next = run_start;

	run_end->next = next;
	next->prev = run_end;
}
/*
 * list_splice - insert the members of @list right after @head.
 * @list: list to take the members from
 * @head: place to insert them
 *
 * Nothing happens when @list is empty. @list itself is not re-initialized.
 */
static inline void list_splice(const struct list_head *list,
			       struct list_head *head)
{
	if (list_empty(list))
		return;
	__list_splice(list, head, head->next);
}

/*
 * list_splice_tail - insert the members of @list right before @head.
 * @list: list to take the members from
 * @head: place to insert them
 *
 * Nothing happens when @list is empty. @list itself is not re-initialized.
 */
static inline void list_splice_tail(struct list_head *list,
				    struct list_head *head)
{
	if (list_empty(list))
		return;
	__list_splice(list, head->prev, head);
}

list_splice的功能和list_cut_position正相反，它合并两个链表。list_splice把list链表中的节点加入head链表中。在实际操作之前，要先判断list链表是否为空。它保证调用__list_splice时list链表中至少有一个节点可以被合并到head链表中。

list_splice_tail只是在合并链表时插入的位置不同。list_splice是把原来list链表中的节点全加到head链表的头部，而list_splice_tail则是把原来list链表中的节点全加到head链表的尾部。

/*
 * list_splice_init - insert the members of @list right after @head and
 * reset @list to an empty list.
 * @list: list to take the members from
 * @head: place to insert them
 *
 * Nothing happens when @list is already empty.
 */
static inline void list_splice_init(struct list_head *list,
				    struct list_head *head)
{
	if (list_empty(list))
		return;

	__list_splice(list, head, head->next);
	INIT_LIST_HEAD(list);
}

/*
 * list_splice_tail_init - insert the members of @list right before @head
 * and reset @list to an empty list.
 * @list: list to take the members from
 * @head: place to insert them
 *
 * Nothing happens when @list is already empty.
 */
static inline void list_splice_tail_init(struct list_head *list,
					 struct list_head *head)
{
	if (list_empty(list))
		return;

	__list_splice(list, head->prev, head);
	INIT_LIST_HEAD(list);
}

list_splice_init 除了完成list_splice的功能，还把变空了的list链表头重新初始化。

list_splice_tail_init 除了完成list_splice_tail的功能，还把变空了的list链表头重新初始化。

list操作的API大致如以上所列，包括链表节点添加与删除、节点从一个链表转移到另一个链表、链表中一个节点被替换为另一个节点、链表的合并与拆分、查看链表当前是否为空或者只有一个节点。接下来，是操作链表遍历时的一些宏，我们也简单介绍一下。

/*
 * list_entry - get the structure that embeds a list node.
 * ptr:    pointer to the embedded struct list_head
 * type:   type of the enclosing structure
 * member: name of the struct list_head field inside 'type'
 * Thin wrapper around container_of().
 */
#define list_entry(ptr, type, member) \
container_of(ptr, type, member)

list_entry主要用于从list节点查找其内嵌在的结构。比如定义一个结构struct A{ struct list_head list; }; 如果知道结构中链表的地址ptrList，就可以从ptrList进而获取整个结构的地址(即整个结构的指针) struct A *ptrA = list_entry(ptrList, struct A, list);

这种地址翻译的技巧是linux的拿手好戏，container_of随处可见，只是链表节点多被封装在更复杂的结构中，使用专门的list_entry定义也是为了使用方便。

/*
 * list_first_entry - get the structure embedding the node after 'ptr'.
 * ptr:    list head
 * type:   type of the enclosing structure
 * member: name of the struct list_head field inside 'type'
 * No emptiness check is made here; (ptr)->next is used as-is.
 */
#define list_first_entry(ptr, type, member) \
list_entry((ptr)->next, type, member)

list_first_entry是将ptr看作一个链表的链表头，取出其中第一个节点对应的结构地址。使用list_first_entry时应保证链表中至少有一个节点。

/*
 * list_for_each - walk the nodes of a list in forward direction.
 * pos:  struct list_head pointer used as loop cursor
 * head: head of the list
 * The loop condition also issues prefetch() on the following node.
 */
#define list_for_each(pos, head) \
for (pos = (head)->next; prefetch(pos->next), pos != (head); \
pos = pos->next)

list_for_each循环遍历链表中的每个节点，从链表头部的第一个节点，一直到链表尾部。中间的prefetch是为了利用平台特性加速链表遍历，在某些平台下定义为空，可以忽略。

/*
 * __list_for_each - forward walk over list nodes without prefetch().
 * pos:  struct list_head pointer used as loop cursor
 * head: head of the list
 */
#define __list_for_each(pos, head) \
for (pos = (head)->next; pos != (head); pos = pos->next)

__list_for_each与list_for_each没什么不同，只是少了prefetch的内容，实现上更为简单易懂。

/*
 * list_for_each_prev - walk the nodes of a list in backward direction.
 * pos:  struct list_head pointer used as loop cursor
 * head: head of the list
 */
#define list_for_each_prev(pos, head) \
for (pos = (head)->prev; prefetch(pos->prev), pos != (head); \
pos = pos->prev)

list_for_each_prev与list_for_each的遍历顺序相反，从链表尾逆向遍历到链表头。

/*
 * list_for_each_safe - forward walk that tolerates removal of 'pos'.
 * pos:  struct list_head pointer used as loop cursor
 * n:    second struct list_head pointer holding the saved next node
 * head: head of the list
 * The successor is fetched into 'n' before the loop body runs, so the
 * body may unlink 'pos'.
 */
#define list_for_each_safe(pos, n, head) \
for (pos = (head)->next, n = pos->next; pos != (head); \
pos = n, n = pos->next)

list_for_each_safe 也是链表顺序遍历，只是更加安全。即使在遍历过程中，当前节点从链表中删除，也不会影响链表的遍历。参数上需要加一个暂存的链表节点指针n。

/*
 * list_for_each_prev_safe - backward walk that tolerates removal of 'pos'.
 * pos:  struct list_head pointer used as loop cursor
 * n:    second struct list_head pointer holding the saved previous node
 * head: head of the list
 */
#define list_for_each_prev_safe(pos, n, head) \
for (pos = (head)->prev, n = pos->prev; \
prefetch(pos->prev), pos != (head); \
pos = n, n = pos->prev)

list_for_each_prev_safe 与list_for_each_prev同样是链表逆序遍历，只是加了链表节点删除保护。

/*
 * list_for_each_entry - forward walk over the structures embedding the nodes.
 * pos:    pointer to the enclosing structure type, used as loop cursor
 * head:   head of the list
 * member: name of the struct list_head field inside the structure
 * The loop ends when the cursor's embedded node is 'head' itself.
 */
#define list_for_each_entry(pos, head, member) \
for (pos = list_entry((head)->next, typeof(*pos), member); \
prefetch(pos->member.next), &pos->member != (head); \
pos = list_entry(pos->member.next, typeof(*pos), member))

list_for_each_entry不是遍历链表节点，而是遍历链表节点所嵌套进的结构。这个实现上较为复杂，但可以等价于list_for_each加上list_entry的组合。

/*
 * list_for_each_entry_reverse - backward walk over the structures
 * embedding the nodes, starting from (head)->prev.
 * pos:    pointer to the enclosing structure type, used as loop cursor
 * head:   head of the list
 * member: name of the struct list_head field inside the structure
 */
#define list_for_each_entry_reverse(pos, head, member) \
for (pos = list_entry((head)->prev, typeof(*pos), member); \
prefetch(pos->member.prev), &pos->member != (head); \
pos = list_entry(pos->member.prev, typeof(*pos), member))

list_for_each_entry_reverse 是逆序遍历链表节点所嵌套进的结构，等价于list_for_each_prev加上list_entry的组合。

/*
 * list_for_each_entry_continue - forward walk starting at the entry that
 * follows the current value of 'pos'.
 * pos:    cursor; must already point at an entry on the list
 * head:   head of the list
 * member: name of the struct list_head field inside the structure
 */
#define list_for_each_entry_continue(pos, head, member) \
for (pos = list_entry(pos->member.next, typeof(*pos), member); \
prefetch(pos->member.next), &pos->member != (head); \
pos = list_entry(pos->member.next, typeof(*pos), member))

list_for_each_entry_continue也是遍历链表上的节点嵌套的结构。只是并非从链表头开始，而是从结构指针的下一个结构开始，一直到链表尾部。

/*
 * list_for_each_entry_continue_reverse - backward walk starting at the
 * entry that precedes the current value of 'pos'.
 * pos:    cursor; must already point at an entry on the list
 * head:   head of the list
 * member: name of the struct list_head field inside the structure
 */
#define list_for_each_entry_continue_reverse(pos, head, member) \
for (pos = list_entry(pos->member.prev, typeof(*pos), member); \
prefetch(pos->member.prev), &pos->member != (head); \
pos = list_entry(pos->member.prev, typeof(*pos), member))

list_for_each_entry_continue_reverse 是逆序遍历链表上的节点嵌套的结构。只是并非从链表尾开始，而是从结构指针的前一个结构开始，一直到链表头部。

/*
 * list_for_each_entry_from - forward walk that begins with the current
 * value of 'pos' (no initialization clause).
 * pos:    cursor; must already point at an entry on the list
 * head:   head of the list
 * member: name of the struct list_head field inside the structure
 */
#define list_for_each_entry_from(pos, head, member) \
for (; prefetch(pos->member.next), &pos->member != (head); \
pos = list_entry(pos->member.next, typeof(*pos), member))

list_for_each_entry_from 是从当前结构指针pos开始，顺序遍历链表上的结构指针。

/*
 * list_for_each_entry_safe - forward walk over entries that tolerates
 * removal of the current entry.
 * pos:    pointer to the enclosing structure type, used as loop cursor
 * n:      second pointer of the same type holding the saved next entry
 * head:   head of the list
 * member: name of the struct list_head field inside the structure
 */
#define list_for_each_entry_safe(pos, n, head, member) \
for (pos = list_entry((head)->next, typeof(*pos), member), \
n = list_entry(pos->member.next, typeof(*pos), member); \
&pos->member != (head); \
pos = n, n = list_entry(n->member.next, typeof(*n), member))

list_for_each_entry_safe 也是顺序遍历链表上节点嵌套的结构。只是加了删除节点的保护。

/*
 * list_for_each_entry_safe_continue - removal-tolerant forward walk that
 * starts at the entry following the current value of 'pos'.
 * pos:    cursor; must already point at an entry on the list
 * n:      second pointer of the same type holding the saved next entry
 * head:   head of the list
 * member: name of the struct list_head field inside the structure
 */
#define list_for_each_entry_safe_continue(pos, n, head, member) \
for (pos = list_entry(pos->member.next, typeof(*pos), member), \
n = list_entry(pos->member.next, typeof(*pos), member); \
&pos->member != (head); \
pos = n, n = list_entry(n->member.next, typeof(*n), member))

list_for_each_entry_safe_continue 是从pos的下一个结构指针开始，顺序遍历链表上的结构指针，同时加了节点删除保护。

/*
 * list_for_each_entry_safe_from - removal-tolerant forward walk that
 * begins with the current value of 'pos'.
 * pos:    cursor; must already point at an entry on the list
 * n:      second pointer of the same type holding the saved next entry
 * head:   head of the list
 * member: name of the struct list_head field inside the structure
 */
#define list_for_each_entry_safe_from(pos, n, head, member) \
for (n = list_entry(pos->member.next, typeof(*pos), member); \
&pos->member != (head); \
pos = n, n = list_entry(n->member.next, typeof(*n), member))

list_for_each_entry_safe_from 是从pos开始，顺序遍历链表上的结构指针，同时加了节点删除保护。

/*
 * list_for_each_entry_safe_reverse - removal-tolerant backward walk over
 * entries, starting from (head)->prev.
 * pos:    pointer to the enclosing structure type, used as loop cursor
 * n:      second pointer of the same type holding the saved previous entry
 * head:   head of the list
 * member: name of the struct list_head field inside the structure
 */
#define list_for_each_entry_safe_reverse(pos, n, head, member) \
for (pos = list_entry((head)->prev, typeof(*pos), member), \
n = list_entry(pos->member.prev, typeof(*pos), member); \
&pos->member != (head); \
pos = n, n = list_entry(n->member.prev, typeof(*n), member))

list_for_each_entry_safe_reverse 是从链表尾部（head->prev）开始，逆序遍历链表上的结构指针，同时加了节点删除保护。

至此为止，我们介绍了linux中双向循环链表的结构、所有的操作函数和遍历宏定义。相信以后在linux代码中遇到链表的使用，不会再陌生。

在任何处理器平台下，都会有一些原子性操作，供操作系统使用，我们这里只讲x86下面的。在单处理器情况下，每条指令的执行都是原子性的，但在多处理器情况下，只有那些单独的读操作或写操作才是原子性的。为了弥补这一缺点，x86提供了附加的lock前缀，使带lock前缀的读修改写指令也能原子性执行。带lock前缀的指令在操作时会锁住总线，使自身的执行即使在多处理器间也是原子性执行的。xchg指令不带lock前缀也是原子性执行，也就是说xchg执行时默认会锁内存总线。原子性操作是线程间同步的基础，linux专门定义了一种只进行原子操作的类型atomic_t，并提供相关的原子读写调用API。本节就来分析这些原子操作在x86下的实现。

/*
 * atomic_t - integer counter meant to be accessed through the atomic_*
 * helpers. The field is declared volatile.
 */
typedef struct {
	volatile int counter;
} atomic_t;

原子类型其实是int类型，只是禁止寄存器对其暂存。

/* ATOMIC_INIT - brace initializer for an atomic_t with starting value i. */
#define ATOMIC_INIT(i) { (i) }

原子类型的初始化。32位x86平台下atomic API在arch/x86/include/asm/atomic_32.h中实现。

/*
 * atomic_read - read the current value of an atomic variable.
 * @v: pointer to the atomic_t to read
 *
 * Returns v->counter; a plain load, no lock prefix involved.
 */
static inline int atomic_read(const atomic_t *v)
{
	return v->counter;
}

/*
 * atomic_set - store a new value into an atomic variable.
 * @v: pointer to the atomic_t to write
 * @i: value to store
 *
 * A plain store, no lock prefix involved.
 */
static inline void atomic_set(atomic_t *v, int i)
{
	v->counter = i;
}

单独的读操作或者写操作，在x86下都是原子性的。

/*
 * atomic_add - atomically add an integer to an atomic variable.
 * @i: value to add
 * @v: pointer to the atomic_t to modify
 *
 * Implemented as a LOCK_PREFIX-ed "addl" on v->counter.
 */
static inline void atomic_add(int i, atomic_t *v)
{
	asm volatile(LOCK_PREFIX "addl %1,%0"
		     : "+m" (v->counter)
		     : "ir" (i));
}

/*
 * atomic_sub - atomically subtract an integer from an atomic variable.
 * @i: value to subtract
 * @v: pointer to the atomic_t to modify
 *
 * Implemented as a LOCK_PREFIX-ed "subl" on v->counter.
 */
static inline void atomic_sub(int i, atomic_t *v)
{
	asm volatile(LOCK_PREFIX "subl %1,%0"
		     : "+m" (v->counter)
		     : "ir" (i));
}

atomic_add和atomic_sub属于读修改写操作，实现时需要加lock前缀。

/*
 * atomic_sub_and_test - atomically subtract and test the result for zero.
 * @i: value to subtract
 * @v: pointer to the atomic_t to modify
 *
 * Returns non-zero if the result of the subtraction is zero ("sete" after
 * the locked "subl"), zero otherwise.
 */
static inline int atomic_sub_and_test(int i, atomic_t *v)
{
	unsigned char c;

	asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
		     : "+m" (v->counter), "=qm" (c)
		     : "ir" (i) : "memory");
	return c;
}

atomic_sub_and_test执行完减操作后检查结果是否为0。

/*
 * atomic_inc - atomically increment an atomic variable by one.
 * @v: pointer to the atomic_t to modify
 *
 * Implemented as a LOCK_PREFIX-ed "incl" on v->counter.
 */
static inline void atomic_inc(atomic_t *v)
{
	asm volatile(LOCK_PREFIX "incl %0"
		     : "+m" (v->counter));
}

/*
 * atomic_dec - atomically decrement an atomic variable by one.
 * @v: pointer to the atomic_t to modify
 *
 * Implemented as a LOCK_PREFIX-ed "decl" on v->counter.
 */
static inline void atomic_dec(atomic_t *v)
{
	asm volatile(LOCK_PREFIX "decl %0"
		     : "+m" (v->counter));
}

atomic_inc和atomic_dec是递增递减操作。

/*
 * atomic_dec_and_test - atomically decrement and test the result for zero.
 * @v: pointer to the atomic_t to modify
 *
 * Returns 1 if the decremented value is zero, 0 otherwise.
 */
static inline int atomic_dec_and_test(atomic_t *v)
{
	unsigned char c;

	asm volatile(LOCK_PREFIX "decl %0; sete %1"
		     : "+m" (v->counter), "=qm" (c)
		     : : "memory");
	return c != 0;
}

atomic_dec_and_test在递减后检查结果是否为0。

/*
 * atomic_inc_and_test - atomically increment and test the result for zero.
 * @v: pointer to the atomic_t to modify
 *
 * Returns 1 if the incremented value is zero, 0 otherwise.
 */
static inline int atomic_inc_and_test(atomic_t *v)
{
	unsigned char c;

	asm volatile(LOCK_PREFIX "incl %0; sete %1"
		     : "+m" (v->counter), "=qm" (c)
		     : : "memory");
	return c != 0;
}

atomic_inc_and_test在递增后检查结果是否为0。

/*
 * atomic_add_negative - atomically add and test the result's sign.
 * @i: value to add
 * @v: pointer to the atomic_t to modify
 *
 * Returns non-zero if the sum is negative ("sets" after the locked
 * "addl"), zero otherwise.
 */
static inline int atomic_add_negative(int i, atomic_t *v)
{
	unsigned char c;

	asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
		     : "+m" (v->counter), "=qm" (c)
		     : "ir" (i) : "memory");
	return c;
}

atomic_add_negative在加操作后检查结果是否为负数。

/*
 * atomic_add_return - atomically add and return the new value.
 * @i: value to add
 * @v: pointer to the atomic_t to modify
 *
 * Returns @i plus the value @v held before the addition. The normal path
 * uses a locked "xaddl", which leaves the old value of v->counter in the
 * register operand. With CONFIG_M386 and a CPU family <= 3 the addition
 * is instead done as read + set with local interrupts disabled.
 */
static inline int atomic_add_return(int i, atomic_t *v)
{
	int __i;
#ifdef CONFIG_M386
	unsigned long flags;

	if (unlikely(boot_cpu_data.x86 <= 3))
		goto no_xadd;
#endif
	/* Modern 486+ processor */
	__i = i;
	asm volatile(LOCK_PREFIX "xaddl %0, %1"
		     : "+r" (i), "+m" (v->counter)
		     : : "memory");
	/* i now holds the old counter value, __i the amount added. */
	return i + __i;

#ifdef CONFIG_M386
no_xadd: /* Legacy 386 processor */
	local_irq_save(flags);
	__i = atomic_read(v);
	atomic_set(v, i + __i);
	local_irq_restore(flags);
	return i + __i;
#endif
}

atomic_add_return 不仅执行加操作，而且把相加的结果返回。它是通过xadd这一指令实现的。

/*
 * atomic_sub_return - atomically subtract and return the new value.
 * @i: value to subtract
 * @v: pointer to the atomic_t to modify
 *
 * Expressed as atomic_add_return() with the negated amount.
 */
static inline int atomic_sub_return(int i, atomic_t *v)
{
	return atomic_add_return(-i, v);
}

atomic_sub_return 不仅执行减操作，而且把相减的结果返回。它是通过atomic_add_return实现的。

/*
 * atomic_cmpxchg - compare-and-exchange on an atomic variable.
 * @v:   pointer to the atomic_t to operate on
 * @old: value the counter is expected to hold
 * @new: value to store if the counter equals @old
 *
 * Returns whatever cmpxchg() returns for v->counter.
 */
static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
{
	return cmpxchg(&v->counter, old, new);
}

/*
 * cmpxchg - type-generic front end for __cmpxchg().
 * ptr: address of the object to operate on
 * o:   expected old value
 * n:   new value
 * The operand width is taken from sizeof(*(ptr)) and the result is cast
 * back to the pointed-to type.
 */
#define cmpxchg(ptr, o, n)						\
	((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o),	\
				       (unsigned long)(n),		\
				       sizeof(*(ptr))))

/*
 * __cmpxchg - locked compare-and-exchange for 1, 2, 4 or 8 byte operands.
 * @ptr:  address of the object to operate on
 * @old:  expected value (passed in the "a" register)
 * @new:  value to store when the comparison succeeds
 * @size: operand size in bytes
 *
 * Returns the value left in the "a" register by the cmpxchg instruction.
 * For an unsupported @size nothing is done and @old is returned.
 */
static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
				      unsigned long new, int size)
{
	unsigned long prev;

	switch (size) {
	case 1:
		asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
			     : "=a"(prev)
			     : "q"(new), "m"(*__xg(ptr)), "0"(old)
			     : "memory");
		return prev;
	case 2:
		asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
			     : "=a"(prev)
			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
			     : "memory");
		return prev;
	case 4:
		asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
			     : "=a"(prev)
			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
			     : "memory");
		return prev;
	case 8:
		asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
			     : "=a"(prev)
			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
			     : "memory");
		return prev;
	}
	return old;
}

atomic_cmpxchg是由cmpxchg指令完成的。它把旧值同atomic_t类型的值相比较，如果相同，就把新值存入atomic_t类型的值中，返回atomic_t类型变量中原有的值。

/*
 * atomic_xchg - exchange the value of an atomic variable.
 * @v:   pointer to the atomic_t to operate on
 * @new: value to store
 *
 * Returns whatever xchg() returns for v->counter.
 */
static inline int atomic_xchg(atomic_t *v, int new)
{
	return xchg(&v->counter, new);
}

/*
 * xchg - type-generic front end for __xchg().
 * ptr: address of the object to operate on
 * v:   value to store
 * The operand width is taken from sizeof(*(ptr)) and the result is cast
 * back to the pointed-to type.
 */
#define xchg(ptr, v)							\
	((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr))))

/*
 * __xchg - exchange a register value with memory for 1, 2, 4 or 8 byte
 * operands.
 * @x:    value to store
 * @ptr:  address of the object to operate on
 * @size: operand size in bytes
 *
 * Returns the value the xchg instruction left in the register operand.
 * No LOCK_PREFIX is written in front of the xchg instructions. For an
 * unsupported @size nothing is done and @x is returned unchanged.
 */
static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
				   int size)
{
	switch (size) {
	case 1:
		asm volatile("xchgb %b0,%1"
			     : "=q" (x)
			     : "m" (*__xg(ptr)), "0" (x)
			     : "memory");
		break;
	case 2:
		asm volatile("xchgw %w0,%1"
			     : "=r" (x)
			     : "m" (*__xg(ptr)), "0" (x)
			     : "memory");
		break;
	case 4:
		asm volatile("xchgl %k0,%1"
			     : "=r" (x)
			     : "m" (*__xg(ptr)), "0" (x)
			     : "memory");
		break;
	case 8:
		asm volatile("xchgq %0,%1"
			     : "=r" (x)
			     : "m" (*__xg(ptr)), "0" (x)
			     : "memory");
		break;
	}
	return x;
}

atomic_xchg则是将新值存入atomic_t类型的变量，并将变量的旧值返回。它使用xchg指令实现。

* atomic_add_unless – add unless the number is already a given value

* @v: pointer of type atomic_t

* @a: the amount to add to v…

* @u: …unless v is equal to u.

* Atomically adds @a to @v, so long as @v was not already @u.

* Returns non-zero if @v was not @u, and zero otherwise.

static inline int atomic_add_unless(atomic_t *v, int a, int u)

{

int c, old;

c = atomic_read(v);

for (;;) {

if (unlikely(c == (u)))

break;

old = atomic_cmpxchg((v), c, c + (a));

if (likely(old == c))

break;

c = old;

}

return c != (u);

}

atomic_add_unless的功能比较特殊。它检查v是否等于u，如果不是则把v的值加上a，返回值表示相加前v是否等于u。因为在atomic_read和atomic_cmpxchg中间可能有其它的写操作，所以要循环检查自己的值是否被写进去。

/* atomic_inc_not_zero - add 1 to v unless its value is 0 (via atomic_add_unless). */
#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)

/* atomic_inc_return - add 1 to v and yield the result of atomic_add_return. */
#define atomic_inc_return(v) (atomic_add_return(1, v))

/* atomic_dec_return - subtract 1 from v and yield the result of atomic_sub_return. */
#define atomic_dec_return(v) (atomic_sub_return(1, v))

atomic_inc_not_zero在v值不是0时加1。

atomic_inc_return对v值加1，并返回相加结果。

atomic_dec_return对v值减1，并返回相减结果。

/*
 * atomic_clear_mask - clear the bits given in 'mask' in the word at 'addr'.
 * mask: bits to clear
 * addr: address of the word to modify
 * Implemented as a LOCK_PREFIX-ed "andl" with the complement of 'mask'.
 */
#define atomic_clear_mask(mask, addr)				\
	asm volatile(LOCK_PREFIX "andl %0,%1"			\
		     : : "r" (~(mask)), "m" (*(addr)) : "memory")

atomic_clear_mask清除变量某些位。

/*
 * atomic_set_mask - set the bits given in 'mask' in the word at 'addr'.
 * mask: bits to set
 * addr: address of the word to modify
 * Implemented as a LOCK_PREFIX-ed "orl".
 */
#define atomic_set_mask(mask, addr)				\
	asm volatile(LOCK_PREFIX "orl %0,%1"			\
		     : : "r" (mask), "m" (*(addr)) : "memory")

atomic_set_mask将变量的某些位置位。

/*
 * Atomic operations are already serializing on x86, so the four hooks
 * below all expand to barrier().
 */

#define smp_mb__before_atomic_dec() barrier()

#define smp_mb__after_atomic_dec() barrier()

#define smp_mb__before_atomic_inc() barrier()

#define smp_mb__after_atomic_inc() barrier()

因为x86的atomic操作大多使用原子指令或者带lock前缀的指令。带lock前缀的指令执行前会完成之前的读写操作，对于原子操作来说不会受之前对同一位置的读写操作的影响，所以这里只是用barrier()代替。barrier()的作用相当于告诉编译器这里有一个内存屏障，放弃在寄存器中的暂存值，重新从内存中读入。

本节的atomic_t类型操作是最基础的，为了介绍下面的内容，必须先介绍它。如果可以使用atomic_t类型代替临界区操作，也可以加快不少速度。

kref是一个引用计数器，它被嵌套进其它的结构中，记录所嵌套结构的引用计数，并在计数清零时调用相应的清理函数。kref的原理和实现都非常简单，但要想用好却不容易，或者说kref被创建就是为了跟踪复杂情况下的结构引用销毁情况。所以这里先介绍kref的实现，再介绍其使用规则。

kref的头文件在include/linux/kref.h，实现在lib/kref.c。闲话少说，上代码。

/*
 * struct kref - reference counter meant to be embedded in another structure.
 */
struct kref {
	atomic_t refcount;	/* current number of references */
};

可以看到，kref的结构中就包含一个atomic_t类型的计数值。atomic_t是原子类型，对其操作都要求是原子执行的，有专门的原子操作API执行，即使在多处理器间也保持原子性。使用atomic_t类型充当计数值，就省去了加锁去锁的过程。

/*
 * kref_set - set the reference count to an explicit value.
 * @kref: object to initialize
 * @num:  count to store
 *
 * The store is followed by smp_mb().
 */
void kref_set(struct kref *kref, int num)
{
	atomic_set(&kref->refcount, num);
	smp_mb();
}

kref_set 设置kref的初始计数值。具体计数值设置由原子操作atomic_set完成。之后还有一个smp_mb()是为了增加内存屏障，保证这一写操作会在之后的读写操作完成之前完成。

/*
 * kref_init - initialize a kref with a reference count of 1.
 * @kref: object to initialize
 */
void kref_init(struct kref *kref)
{
	kref_set(kref, 1);
}

kref_init 初始化kref的计数值为1。

/*
 * kref_get - take an additional reference.
 * @kref: object to take the reference on
 *
 * Warns if the count is currently zero, i.e. the caller did not already
 * hold a reference.
 */
void kref_get(struct kref *kref)
{
	WARN_ON(!atomic_read(&kref->refcount));

	atomic_inc(&kref->refcount);
	smp_mb__after_atomic_inc();
}

kref_get递增kref的计数值。

/*
 * kref_put - drop one reference.
 * @kref:    object to drop the reference on
 * @release: callback invoked when the count reaches zero; passing NULL or
 *           kfree itself triggers a warning
 *
 * Returns 1 if this call dropped the last reference and @release was
 * called, 0 otherwise.
 */
int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
	WARN_ON(release == NULL);
	WARN_ON(release == (void (*)(struct kref *))kfree);

	if (!atomic_dec_and_test(&kref->refcount))
		return 0;

	release(kref);
	return 1;
}

kref_put递减kref的计数值，如果计数值减为0，说明kref所指向的结构生命周期结束，会执行release释放函数。

所以说kref的API很简单，kref_init和kref_set基本都是初始时才会用到，平时常用的就是kref_get和kref_put。一旦在kref_put时计数值清零，立即调用结束函数。

kref设计得如此简单，是为了能灵活地用在各种结构的生命周期管理中。要用好它可不简单，好在Documentation/kref.txt中为我们总结了一些使用规则，下面简单翻译一下。

对于那些用在多种场合，被到处传递的结构，如果没有引用计数，bug几乎总是肯定的事。所以我们需要kref。kref允许我们在已有的结构中方便地添加引用计数。

你可以以如下方式添加kref到你的数据结构中：

/* Example structure that embeds a kref as its reference counter. */
struct my_data {

struct kref refcount; /* reference count for the enclosing my_data */

};

kref可以出现在你结构中的任意位置。

在分配kref后你必须初始化它，可以调用kref_init，把kref计数值初始为1。

/* Example fragment: allocate a my_data and initialize its kref. */
struct my_data *data;

data = kmalloc(sizeof(*data), GFP_KERNEL);

if(!data)

return -ENOMEM;

/* The allocating code holds the initial reference (count set by kref_init). */
kref_init(&data->refcount);

初始化之后，kref的使用应该遵循以下三条规则：

1）如果你制造了一个结构指针的非暂时性副本，特别是当这个副本指针会被传递到其它执行线程时，你必须在传递副本指针之前执行kref_get：

/* Rule 1: take a reference for the copy before handing the pointer off. */
kref_get(&data->refcount);

2）当你使用完，不再需要结构的指针，必须执行kref_put。如果这是结构指针的最后一个引用，release函数会被调用。如果代码绝不会在没有拥有引用计数的情况下去调用kref_get，在kref_put时就不需要加锁。

/* Rule 2: drop the reference when done; data_release is called on the last put. */
kref_put(&data->refcount, data_release);

3）如果代码试图在还没拥有引用计数的情况下就调用kref_get，就必须串行化kref_put和kref_get的执行。因为很可能在kref_get执行之前或者执行中，kref_put就被调用并把整个结构释放掉了。

例如，你分配了一些数据并把它传递到其它线程去处理：

/*
 * data_release - kref release callback for struct my_data.
 * @kref: the refcount member embedded in the my_data being released
 *
 * Recovers the enclosing my_data from @kref and frees it.
 */
void data_release(struct kref *kref)
{
	struct my_data *data = container_of(kref, struct my_data, refcount);

	kfree(data);
}

void more_data_handling(void *cb_data)

{

struct my_data *data = cb_data;

. do stuff with data here

kref_put(&data->refcount, data_release);

}

int my_data_handler(void)

{

int rv = 0;

struct my_data *data;

struct task_struct *task;

data = kmalloc(sizeof(*data), GFP_KERNEL);

if (!data)

return -ENOMEM;

kref_init(&data->refcount);

kref_get(&data->refcount);

task = kthread_run(more_data_handling, data, “more_data_handling”);

if (task == ERR_PTR(-ENOMEM)){

rv = -ENOMEM;

goto out;

}

. do stuff with data here

out:

kref_put(&data->refcount, data_release);

return rv;

}

这样做，无论两个线程的执行顺序是怎样的都无所谓，kref_put知道何时数据不再有引用计数，可以被销毁。kref_get()调用不需要加锁，因为在my_data_handler中调用kref_get时已经拥有一个引用。同样的原因，kref_put也不需要加锁。

要注意规则一中的要求，必须在传递指针之前调用kref_get。决不能写下面的代码：

/* Counter-example: do NOT take the reference after the pointer is handed off. */
task = kthread_run(more_data_handling, data, "more_data_handling");
if (task == ERR_PTR(-ENOMEM)) {
	rv = -ENOMEM;
	goto out;
} else {
	/* BAD BAD BAD - get is after the handoff */
	kref_get(&data->refcount);
}

不要认为自己在使用上面的代码时知道自己在做什么。首先，你可能并不知道你在做什么。其次，你可能知道你在做什么（在部分加锁情况下上面的代码也是正确的），但一些修改或者复制你代码的人并不知道你在做什么。这是一种坏的使用方式。

当然在部分情况下也可以优化对get和put的使用。例如，你已经完成了对这个数据的处理，并要把它传递给其它线程，就不需要再做多余的get和put了。

/* Silly extra get and put: the reference taken here is dropped again right after the enqueue. */

kref_get(&obj->ref);

enqueue(obj);

kref_put(&obj->ref, obj_cleanup);

只需要做enqueue操作即可，可以在其后加一条注释。

enqueue(obj);

/* We are done with obj, so we pass our refcount off to the queue. DON'T TOUCH obj AFTER HERE! */

第三条规则是处理起来最麻烦的。例如，你有一列数据，每条数据都有kref计数，你希望获取第一条数据。但你不能简单地把第一条数据从链表中取出并调用kref_get。这违背了第三条，在调用kref_get前你并没有一个引用。你需要增加一个mutex（或者其它锁）。

/* Lock taken around the kref_get/kref_put calls in get_entry() and put_entry(). */
static DEFINE_MUTEX(mutex);

/* List that the my_data entries are linked on. */
static LIST_HEAD(q);

/* Reference-counted list entry. */
struct my_data

{

struct kref refcount; /* reference count of this entry */

struct list_head link; /* linkage on the list q */

};

/*
 * get_entry - return the first entry on q with a reference taken.
 *
 * The lookup and the kref_get() happen under the mutex. Returns NULL when
 * the list is empty.
 */
static struct my_data *get_entry(void)
{
	struct my_data *entry = NULL;

	mutex_lock(&mutex);
	if (!list_empty(&q)) {
		entry = container_of(q.next, struct my_data, link);
		kref_get(&entry->refcount);
	}
	mutex_unlock(&mutex);

	return entry;
}

/*
 * release_entry - kref release callback for a list entry.
 * @ref: the refcount member embedded in the entry being released
 *
 * Unlinks the entry from its list and frees it.
 */
static void release_entry(struct kref *ref)
{
	struct my_data *entry = container_of(ref, struct my_data, refcount);

	list_del(&entry->link);
	kfree(entry);
}

/*
 * put_entry - drop a reference on a list entry.
 * @entry: entry to release
 *
 * The whole kref_put(), including a possible release_entry() call, runs
 * with the mutex held.
 */
static void put_entry(struct my_data *entry)
{
	mutex_lock(&mutex);
	kref_put(&entry->refcount, release_entry);
	mutex_unlock(&mutex);
}

如果你不想在整个释放过程中都加锁，kref_put的返回值就有用了。例如你不想在加锁情况下调用kfree，你可以如下使用kref_put。

/*
 * Release callback for the variant in which put_entry() does the cleanup
 * itself based on kref_put()'s return value.
 */
static void release_entry(struct kref *ref)

{

/* Intentionally empty: the unlink and free happen in put_entry(). */

}

/*
 * put_entry - drop a reference on a list entry, freeing outside the lock.
 * @entry: entry to release
 *
 * If this was the last reference, the entry is unlinked while the mutex
 * is still held and freed only after the mutex has been released.
 */
static void put_entry(struct my_data *entry)
{
	int last;

	mutex_lock(&mutex);
	last = kref_put(&entry->refcount, release_entry);
	if (last)
		list_del(&entry->link);
	mutex_unlock(&mutex);

	if (last)
		kfree(entry);
}

如果你在撤销结构的过程中需要调用其它的需要较长时间的函数，或者函数也可能要获取同样的互斥锁，这样做就很有用了。但要注意在release函数中做完撤销工作会使代码看起来更整洁。

前面我们说到过list_head，这是linux中通用的链表形式，双向循环链表，功能强大，实现简单优雅。可如果您认为list_head就是链表的极致，应该在linux链表界一统天下，那可就错了。据我所知，linux内核代码中至少还有两种链表能占有一席之地。一种就是hlist，一种就是本节要介绍的klist。虽然三者不同，但hlist和klist都可以看成是从list_head中发展出来的，用于特殊的链表使用情景。hlist是用于哈希表中。众所周知，哈希表主要就是一个哈希数组，为了解决映射冲突的问题，常常把哈希数组的每一项做成一个链表，这样有多少重复的都可以链进去。但哈希数组的项很多，list_head的话每个链表头都需要两个指针的空间，在稀疏的哈希表中实在是一种浪费，于是就发明了hlist。hlist有两大特点，一是它的链表头只需要一个指针，二是它的每一项都可以找到自己的前一节点，也就是说它不再循环，但仍是双向。令人不解的是，hlist的实现太绕了，比如它明明可以直接指向前一节点，却偏偏指向指针地址，还是前一节点中指向后一节点的指针地址。即使这种设计在实现时占便宜，但它理解上带来的不便已经远远超过实现上带来的小小便利。

同hlist一样，klist也是为了适应某类特殊情形的要求。考虑一个被简化的情形，假设一些设备被链接在设备链表中，一个线程命令卸载某设备，即将其从设备链表中删除，但这时该设备正在使用中，这时就出现了冲突。当前可以设置临界区并加锁，但因为使用一个设备而锁住整个设备链表显然是不对的；又或者可以从设备本身做文章，让线程阻塞，这当然也可以。但我们上节了解了kref，就该知道linux对待这种情况的风格，给它一个引用计数kref，等计数为零就删除。klist就是这么干的，它把kref直接保存在了链表节点上。之前说到有线程要求删除设备，之前的使用仍存在，所以不能实际删除，但不应该有新的应用访问到该设备。klist就提供了一种让节点在链表上隐身的方法。下面还是来看实际代码吧。

klist的头文件是include/linux/klist.h，实现在lib/klist.c。

struct klist_node;
/*
 * Head of a klist. The aligned(4) attribute goes together with the
 * KNODE_DEAD trick in this file, which uses bit 0 of a node's pointer to
 * its klist as a flag.
 */
struct klist {
spinlock_t k_lock; /* taken around changes to k_list */
struct list_head k_list; /* list of the klist_node.n_node links */
void (*get)(struct klist_node *); /* optional; called from klist_node_init() */
void (*put)(struct klist_node *); /* optional; called after a node's last reference is dropped */
} __attribute__ ((aligned (4)));
/* Static initializer for a struct klist named _name. */
#define KLIST_INIT(_name, _get, _put) \
{ .k_lock = __SPIN_LOCK_UNLOCKED(_name.k_lock), \
.k_list = LIST_HEAD_INIT(_name.k_list), \
.get = _get, \
.put = _put, }
/* Define and statically initialize a struct klist variable _name. */
#define DEFINE_KLIST(_name, _get, _put) \
struct klist _name = KLIST_INIT(_name, _get, _put)
/* Run-time counterpart of KLIST_INIT. */
extern void klist_init(struct klist *k, void (*get)(struct klist_node *),
void (*put)(struct klist_node *));
/* Member of a klist. */
struct klist_node {
void *n_klist; /* never access directly */
struct list_head n_node; /* linkage on klist.k_list */
struct kref n_ref; /* reference count of this node */
};

可以看到，klist的链表头是struct klist结构，链表节点是struct klist_node结构。先看struct klist，除了包含链表需要的k_list，还有用于加锁的k_lock。剩余的get()和put()函数是用于struct klist_node嵌入在更大的结构中，这样在节点初始时调用get()，在节点删除时调用put()，以表示链表中存在对结构的引用。再看struct klist_node，除了链表需要的n_node，还有一个引用计数n_ref。还有一个比较特殊的指针n_klist，n_klist是指向链表头struct klist的，但它的第0位用来表示是否该节点已被请求删除，如果已被请求删除则在链表循环时是看不到这一节点的，循环函数将其略过。现在你明白为什么非要在struct klist的定义后加上__attribute__((aligned(4)))。不过说实话这样在x86下仍然不太保险，但linux选择了相信gcc，毕竟是多年的战友和兄弟了，相互知根知底。

看过这两个结构，想必大家已经较为清楚了，下面就来看看它们的实现。

/*
* Use the lowest bit of n_klist to mark deleted nodes and exclude
* dead ones from iteration.
*/
/* Flag bit stored in n_klist. */
#define KNODE_DEAD 1LU
/* Mask that strips the flag bit and leaves the struct klist address. */
#define KNODE_KLIST_MASK ~KNODE_DEAD
/*
 * knode_klist - return the klist a node belongs to.
 * @knode: node to query
 *
 * Masks the KNODE_DEAD flag bit out of n_klist before using it as a
 * pointer.
 */
static struct klist *knode_klist(struct klist_node *knode)
{
	unsigned long raw = (unsigned long)knode->n_klist;

	return (struct klist *)(raw & KNODE_KLIST_MASK);
}
/*
 * knode_dead - test whether a node has been marked dead.
 * @knode: node to query
 *
 * Returns true when the KNODE_DEAD bit is set in n_klist.
 */
static bool knode_dead(struct klist_node *knode)
{
	unsigned long raw = (unsigned long)knode->n_klist;

	return (raw & KNODE_DEAD) != 0;
}
/*
 * knode_set_klist - record which klist a node belongs to.
 * @knode: node to update
 * @klist: owning klist, or NULL to detach
 *
 * Warns if the stored pointer ends up with the KNODE_DEAD bit set.
 */
static void knode_set_klist(struct klist_node *knode, struct klist *klist)
{
	knode->n_klist = klist;

	/* no knode deserves to start its life dead */
	WARN_ON(knode_dead(knode));
}
/*
 * knode_kill - mark a node as dead.
 * @knode: node to mark
 *
 * Sets the KNODE_DEAD bit in n_klist; warns if it was already set.
 */
static void knode_kill(struct klist_node *knode)
{
	/* and no knode should die twice ever either, see we're very humane */
	WARN_ON(knode_dead(knode));

	knode->n_klist = (void *)((unsigned long)knode->n_klist | KNODE_DEAD);
}

前面的四个函数都是内部静态函数，帮助API实现的。knode_klist()是从节点找到链表头。knode_dead()是检查该节点是否已被请求删除。

knode_set_klist设置节点的链表头。knode_kill将该节点请求删除。细心的话大家会发现这四个函数是对称的，而且都是操作节点的内部函数。

/*
 * klist_init - initialize a klist at run time.
 * @k:   klist to initialize
 * @get: callback stored in k->get (may be NULL)
 * @put: callback stored in k->put (may be NULL)
 */
void klist_init(struct klist *k, void (*get)(struct klist_node *),
		void (*put)(struct klist_node *))
{
	INIT_LIST_HEAD(&k->k_list);
	spin_lock_init(&k->k_lock);

	k->get = get;
	k->put = put;
}

klist_init，初始化klist。

/*
 * add_head - link node @n at the front of klist @k, under k->k_lock.
 */
static void add_head(struct klist *k, struct klist_node *n)
{
	spin_lock(&k->k_lock);
	list_add(&n->n_node, &k->k_list);
	spin_unlock(&k->k_lock);
}

/*
 * add_tail - link node @n at the end of klist @k, under k->k_lock.
 */
static void add_tail(struct klist *k, struct klist_node *n)
{
	spin_lock(&k->k_lock);
	list_add_tail(&n->n_node, &k->k_list);
	spin_unlock(&k->k_lock);
}
/*
 * klist_node_init - prepare a node for insertion into a klist.
 * @k: klist the node will belong to
 * @n: node to initialize
 *
 * Resets the list linkage, initializes the node's kref, records the
 * owning klist and, if the klist has a get() callback, calls it.
 */
static void klist_node_init(struct klist *k, struct klist_node *n)
{
	INIT_LIST_HEAD(&n->n_node);
	kref_init(&n->n_ref);
	knode_set_klist(n, k);

	if (!k->get)
		return;
	k->get(n);
}

又是三个内部函数，add_head()将节点加入链表头，add_tail()将节点加入链表尾，klist_node_init()是初始化节点。注意在节点的引用计数初始化时，因为引用计数变为1，所以也要调用相应的get()函数。

/*
 * klist_add_head - initialize node @n and add it at the front of klist @k.
 */
void klist_add_head(struct klist_node *n, struct klist *k)
{
	klist_node_init(k, n);
	add_head(k, n);
}

/*
 * klist_add_tail - initialize node @n and add it at the end of klist @k.
 */
void klist_add_tail(struct klist_node *n, struct klist *k)
{
	klist_node_init(k, n);
	add_tail(k, n);
}

klist_add_head()将节点初始化，并加入链表头。

klist_add_tail()将节点初始化，并加入链表尾。

它们正是用上面的三个内部函数实现的，可见linux内核中对函数复用有很强的执念，其实这里add_tail和add_head是不用的，纵观整个文件，也只有klist_add_head()和klist_add_tail()对它们进行了调用。

/*
 * klist_add_after - initialize node @n and insert it right after @pos.
 * @n:   node to insert
 * @pos: existing node; its klist becomes the klist of @n
 */
void klist_add_after(struct klist_node *n, struct klist_node *pos)
{
	struct klist *owner = knode_klist(pos);

	klist_node_init(owner, n);

	spin_lock(&owner->k_lock);
	list_add(&n->n_node, &pos->n_node);
	spin_unlock(&owner->k_lock);
}

/*
 * klist_add_before - initialize node @n and insert it right before @pos.
 * @n:   node to insert
 * @pos: existing node; its klist becomes the klist of @n
 */
void klist_add_before(struct klist_node *n, struct klist_node *pos)
{
	struct klist *owner = knode_klist(pos);

	klist_node_init(owner, n);

	spin_lock(&owner->k_lock);
	list_add_tail(&n->n_node, &pos->n_node);
	spin_unlock(&owner->k_lock);
}

klist_add_after()将节点加到指定节点后面。

klist_add_before()将节点加到指定节点前面。

这两个函数都是对外提供的API。在list_head中都没有看到有这种API，所以说需求决定了接口。虽说只有一步之遥，klist也不愿让外界介入它的内部实现。

之前出现的API都太常见了，既没有使用引用计数，又没有跳过请求删除的节点。所以klist的亮点在下面，klist链表的遍历。

/* Iteration state for walking a klist. */
struct klist_iter {
struct klist *i_klist; /* klist being walked */
struct klist_node *i_cur; /* current node, or NULL */
};
/* Start an iteration at the beginning of klist k. */
extern void klist_iter_init(struct klist *k, struct klist_iter *i);
/* Start an iteration of klist k positioned at node n. */
extern void klist_iter_init_node(struct klist *k, struct klist_iter *i,
struct klist_node *n);
/* Finish an iteration, releasing the current node if there is one. */
extern void klist_iter_exit(struct klist_iter *i);
/* Advance to the next node; returns NULL at the end of the list. */
extern struct klist_node *klist_next(struct klist_iter *i);

以上就是链表遍历需要的辅助结构struct klist_iter，和遍历用到的四个函数。

/* Record of a process sleeping in klist_remove() until a node is released. */
struct klist_waiter {
struct list_head list; /* linkage on klist_remove_waiters */
struct klist_node *node; /* node being waited for */
struct task_struct *process; /* process to wake up */
int woken; /* set to 1 by klist_release() */
};
/* Protects klist_remove_waiters. */
static DEFINE_SPINLOCK(klist_remove_lock);
/* All waiters currently registered by klist_remove(). */
static LIST_HEAD(klist_remove_waiters);
/*
 * kref release callback for a klist node (handed to kref_put() by
 * klist_dec_and_del()). Unlinks the node from its list, wakes every waiter
 * registered for this node on klist_remove_waiters, then clears n_klist.
 * Both callers in this file invoke it with the klist's k_lock held.
 */
static void klist_release(struct kref *kref)
{
struct klist_waiter *waiter, *tmp;
struct klist_node *n = container_of(kref, struct klist_node, n_ref);
/* The last reference is expected to go away only after the node was killed. */
WARN_ON(!knode_dead(n));
list_del(&n->n_node);
spin_lock(&klist_remove_lock);
/* _safe variant because matching waiters are unlinked inside the loop. */
list_for_each_entry_safe(waiter, tmp, &klist_remove_waiters, list) {
if (waiter->node != n)
continue;
waiter->woken = 1;
/* Barrier between setting 'woken' and waking the process. */
mb();
wake_up_process(waiter->process);
list_del(&waiter->list);
}
spin_unlock(&klist_remove_lock);
/* With n_klist NULL, klist_node_attached() reports the node as detached. */
knode_set_klist(n, NULL);
}
/*
 * klist_dec_and_del - drop one reference on a node.
 * @n: node to drop the reference on
 *
 * Returns kref_put()'s result: 1 if klist_release() ran, 0 otherwise.
 */
static int klist_dec_and_del(struct klist_node *n)
{
	return kref_put(&n->n_ref, klist_release);
}
/*
 * Drop one reference on node n, optionally marking it dead first.
 * n:    node whose reference is dropped
 * kill: when true, knode_kill() is applied before the reference is dropped
 * The klist's put() callback runs only if this call released the node,
 * and only after k_lock has been dropped.
 */
static void klist_put(struct klist_node *n, bool kill)
{
struct klist *k = knode_klist(n);
/* Fetch the callback up front; klist_release() clears n->n_klist. */
void (*put)(struct klist_node *) = k->put;
spin_lock(&k->k_lock);
if (kill)
knode_kill(n);
/* Not the last reference: the put() callback must not be called. */
if (!klist_dec_and_del(n))
put = NULL;
spin_unlock(&k->k_lock);
if (put)
put(n);
}
/**
 * klist_del - Decrement the reference count of node and try to remove.
 * @n: node we're deleting.
 *
 * Marks the node dead and drops one reference; does not wait for other
 * reference holders.
 */
void klist_del(struct klist_node *n)
{
	klist_put(n, true);
}

以上的内容乍一看很难理解，其实都是klist实现必须的。因为使用kref动态删除，自然需要一个计数降为零时调用的函数klist_release。

klist_dec_and_del()就是对kref_put()的包装，起到减少节点引用计数的功能。

至于为什么会出现一个新的结构struct klist_waiter，也很简单。之前说有线程申请删除某节点，但节点的引用计数仍在，所以只能把请求删除的线程阻塞，就是用struct klist_waiter阻塞在klist_remove_waiters上。所以在klist_release()调用时还要将阻塞的线程唤醒。knode_kill()将节点设为已请求删除。而且还会调用put()函数。

释放引用计数是调用klist_del()，它通过内部函数klist_put()完成所需操作：用knode_kill()设置节点为已请求删除，用klist_dec_and_del()释放引用，调用可能的put()函数。

/**
 * klist_remove - Decrement the refcount of node and wait for it to go away.
 * @n: node we're removing.
 *
 * Registers an on-stack waiter for @n, calls klist_del(), then sleeps
 * until klist_release() has set the waiter's 'woken' flag.
 */
void klist_remove(struct klist_node *n)
{
struct klist_waiter waiter;
waiter.node = n;
waiter.process = current;
waiter.woken = 0;
/* Register before klist_del() so that the release cannot be missed. */
spin_lock(&klist_remove_lock);
list_add(&waiter.list, &klist_remove_waiters);
spin_unlock(&klist_remove_lock);
klist_del(n);
for (;;) {
/* Set the task state before testing 'woken', then sleep if still unset. */
set_current_state(TASK_UNINTERRUPTIBLE);
if (waiter.woken)
break;
schedule();
}
__set_current_state(TASK_RUNNING);
}

klist_remove()不但会调用klist_del()减少引用计数，还会一直阻塞到节点被删除。这个函数才是请求删除节点的线程应该调用的。

/*
 * klist_node_attached - test whether a node currently belongs to a klist.
 * @n: node to query
 *
 * Returns non-zero when n->n_klist is not NULL.
 */
int klist_node_attached(struct klist_node *n)
{
	return n->n_klist != NULL;
}

klist_node_attached()检查节点是否被包含在某链表中。

以上是klist的链表初始化，节点加入，节点删除函数。下面是klist链表遍历函数。

/* Iteration state for walking a klist. */
struct klist_iter {
struct klist *i_klist; /* klist being walked */
struct klist_node *i_cur; /* current node, or NULL */
};
/* Start an iteration at the beginning of klist k. */
extern void klist_iter_init(struct klist *k, struct klist_iter *i);
/* Start an iteration of klist k positioned at node n. */
extern void klist_iter_init_node(struct klist *k, struct klist_iter *i,
struct klist_node *n);
/* Finish an iteration, releasing the current node if there is one. */
extern void klist_iter_exit(struct klist_iter *i);
/* Advance to the next node; returns NULL at the end of the list. */
extern struct klist_node *klist_next(struct klist_iter *i);

klist的遍历有些复杂，因为它考虑到了在遍历过程中节点删除的情况，而且还要忽略那些已被删除的节点。宏实现已经无法满足要求，迫不得已，只能用函数实现，并用struct klist_iter记录中间状态。

/*
 * klist_iter_init_node - set up an iterator positioned at a given node.
 * @k: klist to iterate over
 * @i: iterator to initialize
 * @n: node to start from, or NULL to start at the beginning of the list
 *
 * When @n is given, a reference is taken on it for the iterator.
 */
void klist_iter_init_node(struct klist *k, struct klist_iter *i,
			  struct klist_node *n)
{
	i->i_klist = k;
	i->i_cur = n;

	if (n != NULL)
		kref_get(&n->n_ref);
}

/*
 * klist_iter_init - set up an iterator at the beginning of a klist.
 * @k: klist to iterate over
 * @i: iterator to initialize
 */
void klist_iter_init(struct klist *k, struct klist_iter *i)
{
	klist_iter_init_node(k, i, NULL);
}

klist_iter_init_node()是从klist中的某个节点开始遍历，而klist_iter_init()是从链表头开始遍历的。

但你又要注意，klist_iter_init()和klist_iter_init_node()的用法又不同。klist_iter_init_node()可以在其后直接对当前节点进行访问，也可以调用klist_next()访问下一节点。而klist_iter_init()只能调用klist_next()访问下一节点。或许klist_iter_init_node()的本意不是从当前节点开始，而是从当前节点的下一节点开始。

/*
 * to_klist_node - convert an n_node link back to its klist_node.
 * @n: pointer to the n_node member of a struct klist_node
 */
static struct klist_node *to_klist_node(struct list_head *n)
{
	return container_of(n, struct klist_node, n_node);
}

/*
 * klist_next - advance an iterator to the next live node.
 * i: iterator state
 * Drops the reference held on the previous node, skips nodes marked dead,
 * takes a reference on the node it stops at and returns it; returns NULL
 * once the list head is reached.
 */
struct klist_node *klist_next(struct klist_iter *i)
{
void (*put)(struct klist_node *) = i->i_klist->put;
struct klist_node *last = i->i_cur;
struct klist_node *next;
spin_lock(&i->i_klist->k_lock);
if (last) {
/* Read the successor before the put below can release 'last'. */
next = to_klist_node(last->n_node.next);
/* Not the last reference on 'last': its put() must not be called. */
if (!klist_dec_and_del(last))
put = NULL;
} else
/* Fresh iterator: begin with the first node of the list. */
next = to_klist_node(i->i_klist->k_list.next);
i->i_cur = NULL;
/* The list head converted by to_klist_node() serves as the end marker. */
while (next != to_klist_node(&i->i_klist->k_list)) {
if (likely(!knode_dead(next))) {
kref_get(&next->n_ref);
i->i_cur = next;
break;
}
next = to_klist_node(next->n_node.next);
}
spin_unlock(&i->i_klist->k_lock);
/* The put() callback is invoked only after the lock has been dropped. */
if (put && last)
put(last);
return i->i_cur;
}

klist_next()是将循环进行到下一节点。实现中需要注意两点问题：1、加锁，根据经验，单纯对某个节点操作不需要加锁，但对影响整个链表的操作需要加自旋锁。比如之前klist_iter_init_node()中对节点增加引用计数，就不需要加锁，因为只有已经拥有节点引用计数的线程才会特别地从那个节点开始。而之后klist_next()中则需要加锁，因为当前线程很可能没有引用计数，所以需要加锁，让情况固定下来。这既是保护链表，也是保护节点有效。符合kref引用计数的使用原则。2、要注意，虽然在节点切换的过程中是加锁的，但切换完访问当前节点时是解锁的，中间可能有节点被删除（这个通过spin_lock就可以搞定），也可能有节点被请求删除，这就需要注意。首先要忽略链表中已被请求删除的节点，然后在减少前一个节点引用计数时，可能就把前一个节点删除了。这里之所以不调用klist_put()，是因为本身已处于加锁状态，但仍要有它的实现。这里的实现和klist_put()中类似，代码不介意在加锁状态下唤醒另一个线程，但却不希望在加锁状态下调用put()函数，那可能会涉及释放另一个更大的结构。

/*
 * klist_iter_exit - finish an iteration.
 * @i: iterator state
 *
 * If the iterator still holds a current node, its reference is dropped
 * (without marking the node dead) and the iterator is cleared.
 */
void klist_iter_exit(struct klist_iter *i)
{
	if (!i->i_cur)
		return;

	klist_put(i->i_cur, false);
	i->i_cur = NULL;
}

klist_iter_exit()，遍历结束函数。在遍历完成时调不调无所谓，但如果想中途结束，就一定要调用klist_iter_exit()。

klist主要用于设备驱动模型中，为了适应那些动态变化的设备和驱动，而专门设计的链表。klist并不通用，但它真的很新奇。我看到它时，震惊于链表竟然可以专门异化成这种样子。如果你是松耦合的结构，如果你手下净是些桀骜不驯的家伙，那么不要只考虑kref，你可能还需要klist。

之前我们分析了引用计数kref，总结了sysfs提供的API，并翻译了介绍kobject原理及用法的文档。应该说准备工作做得足够多，kobject的实现怎么都可以看懂了，甚至只需要总结下API就行了。可我还是决定把kobject的实现代码从头分析一遍。一是因为kobject的代码很重要，会在设备驱动模型代码中无数次被用到，如果不熟悉的话可以说是举步维艰。二是为了熟悉linux的编码风格，为以后分析更大规模的代码奠定基础。

kobject的头文件在include/linux/kobject.h，实现在lib/kobject.c。闲话少说，上代码。

/*
 * struct kobject - basic object of the device model.
 */
struct kobject {
	const char		*name;		/* object name */
	struct list_head	entry;		/* linkage on the owning kset's list */
	struct kobject		*parent;	/* parent object */
	struct kset		*kset;		/* kset this object belongs to */
	struct kobj_type	*ktype;		/* type of this object */
	struct sysfs_dirent	*sd;		/* corresponding sysfs directory */
	struct kref		kref;		/* reference count */
	/* state flags */
	unsigned int state_initialized:1;
	unsigned int state_in_sysfs:1;
	unsigned int state_add_uevent_sent:1;
	unsigned int state_remove_uevent_sent:1;
	unsigned int uevent_suppress:1;
};

在struct kobject中，name是名字，entry是用于kobject所属kset下的子kobject链表，parent指向kobject的父节点，kset指向kobject所属的kset，ktype定义了kobject所属的类型，sd指向kobject对应的sysfs目录，kref记录kobject的引用计数，之后是一系列标志。

/*
 * struct kobj_type - operations and attributes shared by kobjects of one type.
 */
struct kobj_type {
	void (*release)(struct kobject *kobj);	/* called when the kobject is released */
	struct sysfs_ops *sysfs_ops;		/* attribute file read/write handlers */
	struct attribute **default_attrs;	/* NULL-terminated default attribute array */
};

struct kobj_type就是定义了kobject的公共类型，其中既有操作的函数，也有公共的属性。其中release()是在kobject释放时调用的，sysfs_ops中定义了读写属性文件时调用的函数。default_attrs中定义了这类kobject公共的属性。

/*
 * struct kset - a kobject plus a list of member kobjects.
 */
struct kset {
	struct list_head list;		/* head of the list of member kobjects */
	spinlock_t list_lock;		/* protects @list */
	struct kobject kobj;		/* embedded kobject representing the kset itself */
	struct kset_uevent_ops *uevent_ops;	/* uevent hooks for members of this kset */
};

struct kset可以看成在kobject上的扩展，它包含一个kobject的链表，可以方便地表示sysfs中目录与子目录的关系。其中，list是所属kobject的链表头，list_lock用于在访问链表时加锁，kobj是kset的内部kobject，要表现为sysfs中的目录就必须拥有kobject的功能，最后的kset_uevent_ops定义了对发往用户空间的uevent的处理。我对uevent不了解，会尽量忽略。

/*
 * struct kobj_attribute - attribute with kobject-level show/store
 * callbacks; dispatched to by kobj_attr_show()/kobj_attr_store().
 */
struct kobj_attribute {
	struct attribute attr;	/* embedded generic attribute */
	/* read callback; may be NULL */
	ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf);
	/* write callback; may be NULL */
	ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count);
};

struct kobj_attribute是kobject在attribute上做出的扩展，添加了两个专门读写kobject属性的函数。无论是kobject，还是kset（说到底是kset内部的kobject），都提供了使用kobj_attribute的快速创建方法。

结构差不多介绍完了，下面看看实现。我所知道的代码分析风格，喜欢自顶向下的方式，从一个函数开始，介绍出一个函数调用树。在代码量很大，涉及调用层次很深的时候，确实要采用这种打洞的方式来寻找突破口。但这种自顶向下的方式有两个问题：一是很容易迷失，二是代码分析的难度会逐渐增大而不是减小。在茫茫的代码中，你一头下去，周围都是你不认识的函数，一个函数里调用了三个陌生的函数，其中一个陌生的函数又调用了五个更陌生的函数…不久你就会产生很强的挫败感。这就像走在沙漠上，你不知道终点在哪，也许翻过一个沙丘就到了，也许还有无数个沙丘。而且在这种分析时，人是逐渐走向细节，容易被细节所困扰，忽略了整体的印象与代码的层次感。所以，我觉得在分析代码时，也可以采用自底向上的方式，从细小的、内部使用的函数，到比较宏观的、供外部调用的函数。而且按照这种顺序来看代码，基本就是文件从头读到尾的顺序，也比较符合写代码的流程。linux代码喜欢在文件开始处攒内部静态函数，攒到一定程度爆发，突然实现几个外部API，然后再攒，再实现。而且之前的内部静态函数会反复调用到。linux代码写得很有层次感，除了内外有别，还把意思相近的，或者功能刚好相反的，或者使用时顺序调用的函数放在一起，很便于阅读。闲话少说，等你看完kobject的实现自然就清楚了。

static int populate_dir(struct kobject *kobj)

{

struct kobj_type *t = get_ktype(kobj);

struct attribute *attr;

int error = 0;

int i;

if (t && t->default_attrs) {

for (i = 0; (attr = t->default_attrs[i]) != NULL; i++) {

error = sysfs_create_file(kobj, attr);

if (error)

break;

}

}

return error;

}

/*
 * create_dir - create the sysfs directory of @kobj and fill it with the
 * default attribute files.
 *
 * A kobject without a name gets no directory and 0 is returned.  If the
 * attribute files cannot be created the directory is removed again.
 */
static int create_dir(struct kobject *kobj)
{
	int error;

	if (!kobject_name(kobj))
		return 0;

	error = sysfs_create_dir(kobj);
	if (error)
		return error;

	error = populate_dir(kobj);
	if (error)
		sysfs_remove_dir(kobj);

	return error;
}

create_dir()在sysfs中创建kobj对应的目录，populate_dir()创建kobj中默认属性对应的文件。create_dir()正是调用populate_dir()实现的。

static int get_kobj_path_length(struct kobject *kobj)

{

int length = 1;

struct kobject *parent = kobj;

/* walk up the ancestors until we hit the one pointing to the

* root.

* Add 1 to strlen for leading ‘/’ of each level.

do {

if (kobject_name(parent) == NULL)

return 0;

length += strlen(kobject_name(parent)) + 1;

parent = parent->parent;

} while (parent);

return length;

}

/*
 * fill_kobj_path - write the path of @kobj into @path, back to front.
 * @kobj:   object whose path is generated
 * @path:   destination buffer of at least @length bytes
 * @length: value returned by get_kobj_path_length() for @kobj
 *
 * No terminating NUL is written here; kobject_get_path() passes a
 * zero-filled buffer so the last byte is already NUL.
 */
static void fill_kobj_path(struct kobject *kobj, char *path, int length)
{
	struct kobject *parent;

	/* skip the byte reserved for the terminating NUL */
	--length;
	for (parent = kobj; parent; parent = parent->parent) {
		int cur = strlen(kobject_name(parent));
		/* back up enough to print this name with '/' */
		length -= cur;
		strncpy(path + length, kobject_name(parent), cur);
		*(path + --length) = '/';
	}

	pr_debug("kobject: '%s' (%p): %s: path = '%s'\n", kobject_name(kobj),
		 kobj, __func__, path);
}

* kobject_get_path – generate and return the path associated with a given kobj and kset pair.

* @kobj: kobject in question, with which to build the path

* @gfp_mask: the allocation type used to allocate the path

* The result must be freed by the caller with kfree().

char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask)

{

char *path;

int len;

len = get_kobj_path_length(kobj);

if (len == 0)

return NULL;

path = kzalloc(len, gfp_mask);

if (!path)

return NULL;

fill_kobj_path(kobj, path, len);

return path;

}

前面两个是内部函数，get_kobj_path_length()获得kobj路径名的长度，fill_kobj_path()把kobj路径名填充到path缓冲区中。

kobject_get_path()靠两个函数获得kobj的路径名，从攒函数到爆发一气呵成。

static void kobj_kset_join(struct kobject *kobj)

{

if (!kobj->kset)

return;

kset_get(kobj->kset);

spin_lock(&kobj->kset->list_lock);

list_add_tail(&kobj->entry, &kobj->kset->list);

spin_unlock(&kobj->kset->list_lock);

}

/* remove the kobject from its kset’s list */

static void kobj_kset_leave(struct kobject *kobj)

{

if (!kobj->kset)

return;

spin_lock(&kobj->kset->list_lock);

list_del_init(&kobj->entry);

spin_unlock(&kobj->kset->list_lock);

kset_put(kobj->kset);

}

kobj_kset_join()把kobj加入kobj->kset的链表中，kobj_kset_leave()把kobj从kobj->kset的链表中去除，两者功能相对。

/*
 * kobject_init_internal - initialise the fields of @kobj that need no
 * outside input: the reference count, the list entry and the state
 * flags.  name, parent, kset and ktype are left untouched.  A NULL
 * @kobj is ignored.
 */
static void kobject_init_internal(struct kobject *kobj)
{
	if (kobj) {
		kref_init(&kobj->kref);
		INIT_LIST_HEAD(&kobj->entry);
		kobj->state_in_sysfs = 0;
		kobj->state_add_uevent_sent = 0;
		kobj->state_remove_uevent_sent = 0;
		kobj->state_initialized = 1;
	}
}

static int kobject_add_internal(struct kobject *kobj)

{

int error = 0;

struct kobject *parent;

if (!kobj)

return -ENOENT;

if (!kobj->name || !kobj->name[0]) {

WARN(1, “kobject: (%p): attempted to be registered with empty “

“name!\n”, kobj);

return -EINVAL;

}

parent = kobject_get(kobj->parent);

/* join kset if set, use it as parent if we do not already have one */

if (kobj->kset) {

if (!parent)

parent = kobject_get(&kobj->kset->kobj);

kobj_kset_join(kobj);

kobj->parent = parent;

}

pr_debug(“kobject: ‘%s’ (%p): %s: parent: ‘%s’, set: ‘%s’\n”,

kobject_name(kobj), kobj, __func__,

parent ? kobject_name(parent) : “<NULL>”,

kobj->kset ? kobject_name(&kobj->kset->kobj) : “<NULL>”);

error = create_dir(kobj);

if (error) {

kobj_kset_leave(kobj);

kobject_put(parent);

kobj->parent = NULL;

/* be noisy on error issues */

if (error == -EEXIST)

printk(KERN_ERR “%s failed for %s with “

“-EEXIST, don’t try to register things with “

“the same name in the same directory.\n”,

__func__, kobject_name(kobj));

else

printk(KERN_ERR “%s failed for %s (%d)\n”,

__func__, kobject_name(kobj), error);

dump_stack();

} else

kobj->state_in_sysfs = 1;

return error;

}

kobject_init_internal()初始化kobj。

kobject_add_internal()把kobj加入已有的结构。

这两个函数看似无关，实际很有关系。在kobject中有好几个结构变量，但重要的只有两个，一个是kset，一个是parent。这两个都是表示当前kobject在整个体系中的位置，决不能自行决定，需要外部参与设置。那把kobject创建的过程分为init和add两个阶段也就很好理解了。kobject_init_internal()把一些能自动初始化的结构变量初始化掉，等外界设置了parent和kset，再调用kobject_add_internal()把kobject安在适当的位置，并创建相应的sysfs目录及文件。

/**
 * kobject_set_name_vargs - set the name of a kobject from a va_list
 * @kobj: struct kobject to set the name of
 * @fmt: format string used to build the name
 * @vargs: arguments for @fmt
 *
 * Does nothing if the kobject already has a name and @fmt is NULL.
 * Every '/' in the resulting name is replaced by '!'.  The previous
 * name, if any, is freed.
 *
 * Returns 0 on success or -ENOMEM if the name cannot be allocated.
 */
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
			   va_list vargs)
{
	const char *old_name = kobj->name;
	char *s;

	if (kobj->name && !fmt)
		return 0;

	kobj->name = kvasprintf(GFP_KERNEL, fmt, vargs);
	if (!kobj->name)
		return -ENOMEM;

	/* ewww... some of these buggers have '/' in the name ... */
	while ((s = strchr(kobj->name, '/')))
		s[0] = '!';

	kfree(old_name);
	return 0;
}

/**
 * kobject_set_name - Set the name of a kobject
 * @kobj: struct kobject to set the name of
 * @fmt: format string used to build the name
 *
 * This sets the name of the kobject.  If you have already added the
 * kobject to the system, you must call kobject_rename() in order to
 * change the name of the kobject.
 */
int kobject_set_name(struct kobject *kobj, const char *fmt, ...)
{
	va_list vargs;
	int retval;

	va_start(vargs, fmt);
	retval = kobject_set_name_vargs(kobj, fmt, vargs);
	va_end(vargs);

	return retval;
}

kobject_set_name()是设置kobj名称的，它又调用kobject_set_name_vargs()实现。但要注意，这个kobject_set_name()仅限于kobject添加到体系之前，因为它只是修改了名字，并未通知用户空间。

void kobject_init(struct kobject *kobj, struct kobj_type *ktype)

{

char *err_str;

if (!kobj) {

err_str = “invalid kobject pointer!”;

goto error;

}

if (!ktype) {

err_str = “must have a ktype to be initialized properly!\n”;

goto error;

}

if (kobj->state_initialized) {

/* do not error out as sometimes we can recover */

printk(KERN_ERR “kobject (%p): tried to init an initialized “

“object, something is seriously wrong.\n”, kobj);

dump_stack();

}

kobject_init_internal(kobj);

kobj->ktype = ktype;

return;

error:

printk(KERN_ERR “kobject (%p): %s\n”, kobj, err_str);

dump_stack();

}

kobject_init()就是调用kobject_init_internal()自动初始化了一些结构变量，然后又设置了ktype。其实这个ktype主要是管理一些默认属性什么的，只要在kobject_add_internal()调用create_dir()之前设置就行，之所以会出现在kobject_init()中，完全是为了与后面的kobject_create()相对比。

static int kobject_add_varg(struct kobject *kobj, struct kobject *parent,

const char *fmt, va_list vargs)

{

int retval;

retval = kobject_set_name_vargs(kobj, fmt, vargs);

if (retval) {

printk(KERN_ERR “kobject: can not set name properly!\n”);

return retval;

}

kobj->parent = parent;

return kobject_add_internal(kobj);

}

* kobject_add – the main kobject add function

* @kobj: the kobject to add

* @parent: pointer to the parent of the kobject.

* @fmt: format to name the kobject with.

* The kobject name is set and added to the kobject hierarchy in this

* function.

* If @parent is set, then the parent of the @kobj will be set to it.

* If @parent is NULL, then the parent of the @kobj will be set to the

* kobject associted with the kset assigned to this kobject. If no kset

* is assigned to the kobject, then the kobject will be located in the

* root of the sysfs tree.

* If this function returns an error, kobject_put() must be called to

* properly clean up the memory associated with the object.

* Under no instance should the kobject that is passed to this function

* be directly freed with a call to kfree(), that can leak memory.

* Note, no “add” uevent will be created with this call, the caller should set

* up all of the necessary sysfs files for the object and then call

* kobject_uevent() with the UEVENT_ADD parameter to ensure that

* userspace is properly notified of this kobject’s creation.

int kobject_add(struct kobject *kobj, struct kobject *parent,

const char *fmt, …)

{

va_list args;

int retval;

if (!kobj)

return -EINVAL;

if (!kobj->state_initialized) {

printk(KERN_ERR “kobject ‘%s’ (%p): tried to add an “

“uninitialized object, something is seriously wrong.\n”,

kobject_name(kobj), kobj);

dump_stack();

return -EINVAL;

}

va_start(args, fmt);

retval = kobject_add_varg(kobj, parent, fmt, args);

va_end(args);

return retval;

}

kobject_add()把kobj添加到体系中。但它还有一个附加功能，设置kobj的名字。parent也是作为参数传进来的，至于为什么kset没有同样传进来，或许是历史遗留原因吧。

/**
 * kobject_init_and_add - initialize a kobject structure and add it to the kobject hierarchy
 * @kobj: pointer to the kobject to initialize
 * @ktype: pointer to the ktype for this kobject.
 * @parent: pointer to the parent of this kobject.
 * @fmt: the name of the kobject.
 *
 * Calls kobject_init() and then performs the same name/parent/add steps
 * as kobject_add().  Returns the result of the add step.
 */
int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
			 struct kobject *parent, const char *fmt, ...)
{
	va_list args;
	int retval;

	kobject_init(kobj, ktype);

	va_start(args, fmt);
	retval = kobject_add_varg(kobj, parent, fmt, args);
	va_end(args);

	return retval;
}

kobject_init_and_add()虽然是kobject_init()和kobject_add()的合并，但并不常用，因为其中根本没留下设置kset的空挡，这无疑不太合适。

/**
 * kobject_rename - change the name of an object
 * @kobj: object in question.
 * @new_name: object's new name
 *
 * Renames the sysfs directory, installs a copy of @new_name as the
 * kobject name and sends a KOBJ_MOVE uevent carrying the old path in
 * DEVPATH_OLD.
 *
 * Returns 0 on success, -EINVAL if @kobj is NULL or has no parent,
 * -ENOMEM on allocation failure, or the error from sysfs_rename_dir().
 */
int kobject_rename(struct kobject *kobj, const char *new_name)
{
	int error = 0;
	const char *devpath = NULL;
	const char *dup_name = NULL, *name;
	char *devpath_string = NULL;
	char *envp[2];

	kobj = kobject_get(kobj);
	if (!kobj)
		return -EINVAL;
	if (!kobj->parent) {
		/* drop the reference taken above, it would be leaked otherwise */
		kobject_put(kobj);
		return -EINVAL;
	}

	devpath = kobject_get_path(kobj, GFP_KERNEL);
	if (!devpath) {
		error = -ENOMEM;
		goto out;
	}
	/* room for "DEVPATH_OLD=" (12 bytes), the path and the NUL */
	devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL);
	if (!devpath_string) {
		error = -ENOMEM;
		goto out;
	}
	sprintf(devpath_string, "DEVPATH_OLD=%s", devpath);
	envp[0] = devpath_string;
	envp[1] = NULL;

	name = dup_name = kstrdup(new_name, GFP_KERNEL);
	if (!name) {
		error = -ENOMEM;
		goto out;
	}

	error = sysfs_rename_dir(kobj, new_name);
	if (error)
		goto out;

	/* Install the new kobject name */
	/* dup_name now holds the old name so that it is freed at out: */
	dup_name = kobj->name;
	kobj->name = name;

	/* This function is mostly/only used for network interface.
	 * Some hotplug package track interfaces by their name and
	 * therefore want to know when the name is changed by the user. */
	kobject_uevent_env(kobj, KOBJ_MOVE, envp);

out:
	kfree(dup_name);
	kfree(devpath_string);
	kfree(devpath);
	kobject_put(kobj);

	return error;
}

kobject_rename()就是在kobj已经添加到系统之后，要改名字时调用的函数。它除了完成kobject_set_name()的功能，还向用户空间通知这一消息。

/**
 * kobject_move - move object to another parent
 * @kobj: object in question.
 * @new_parent: object's new parent (can be NULL)
 *
 * With a NULL @new_parent the kobject embedded in the object's kset, if
 * any, is used instead.  On success the reference on the old parent is
 * dropped and a KOBJ_MOVE uevent carrying the old path in DEVPATH_OLD
 * is sent.
 *
 * Returns 0 on success, -EINVAL for a NULL @kobj, -ENOMEM on allocation
 * failure, or the error from sysfs_move_dir().
 */
int kobject_move(struct kobject *kobj, struct kobject *new_parent)
{
	int error;
	struct kobject *old_parent;
	const char *devpath = NULL;
	char *devpath_string = NULL;
	char *envp[2];

	kobj = kobject_get(kobj);
	if (!kobj)
		return -EINVAL;
	new_parent = kobject_get(new_parent);
	if (!new_parent) {
		if (kobj->kset)
			new_parent = kobject_get(&kobj->kset->kobj);
	}
	/* old object path */
	devpath = kobject_get_path(kobj, GFP_KERNEL);
	if (!devpath) {
		error = -ENOMEM;
		goto out;
	}
	devpath_string = kmalloc(strlen(devpath) + 15, GFP_KERNEL);
	if (!devpath_string) {
		error = -ENOMEM;
		goto out;
	}
	sprintf(devpath_string, "DEVPATH_OLD=%s", devpath);
	envp[0] = devpath_string;
	envp[1] = NULL;
	error = sysfs_move_dir(kobj, new_parent);
	if (error)
		goto out;
	old_parent = kobj->parent;
	kobj->parent = new_parent;
	/* the reference is now owned by kobj->parent; don't drop it at out: */
	new_parent = NULL;
	kobject_put(old_parent);
	kobject_uevent_env(kobj, KOBJ_MOVE, envp);
out:
	kobject_put(new_parent);
	kobject_put(kobj);
	kfree(devpath_string);
	kfree(devpath);
	return error;
}

kobject_move()则是在kobj添加到系统后，想移动到新的parent kobject下所调用的函数。在通知用户空间上，与kobject_rename()调用的是同一操作。

/**
 * kobject_del - unlink kobject from hierarchy.
 * @kobj: object.
 *
 * Removes the sysfs directory, clears state_in_sysfs, leaves the kset
 * list and drops the reference on the parent.  A NULL @kobj is ignored.
 */
void kobject_del(struct kobject *kobj)
{
	if (kobj) {
		sysfs_remove_dir(kobj);
		kobj->state_in_sysfs = 0;
		kobj_kset_leave(kobj);
		kobject_put(kobj->parent);
		kobj->parent = NULL;
	}
}

kobject_del()仅仅是把kobj从系统中退出，相对于kobject_add()操作。

* kobject_get – increment refcount for object.

* @kobj: object.

struct kobject *kobject_get(struct kobject *kobj)

{

if (kobj)

kref_get(&kobj->kref);

return kobj;

}

* kobject_cleanup – free kobject resources.

* @kobj: object to cleanup

static void kobject_cleanup(struct kobject *kobj)

{

struct kobj_type *t = get_ktype(kobj);

const char *name = kobj->name;

pr_debug(“kobject: ‘%s’ (%p): %s\n”,

kobject_name(kobj), kobj, __func__);

if (t && !t->release)

pr_debug(“kobject: ‘%s’ (%p): does not have a release() “

“function, it is broken and must be fixed.\n”,

kobject_name(kobj), kobj);

/* send “remove” if the caller did not do it but sent “add” */

if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) {

pr_debug(“kobject: ‘%s’ (%p): auto cleanup ‘remove’ event\n”,

kobject_name(kobj), kobj);

kobject_uevent(kobj, KOBJ_REMOVE);

}

/* remove from sysfs if the caller did not do it */

if (kobj->state_in_sysfs) {

pr_debug(“kobject: ‘%s’ (%p): auto cleanup kobject_del\n”,

kobject_name(kobj), kobj);

kobject_del(kobj);

}

if (t && t->release) {

pr_debug(“kobject: ‘%s’ (%p): calling ktype release\n”,

kobject_name(kobj), kobj);

t->release(kobj);

}

/* free name if we allocated it */

if (name) {

pr_debug(“kobject: ‘%s’: free name\n”, name);

kfree(name);

}

}

/*
 * kref release callback handed to kref_put() by kobject_put(): maps the
 * kref back to its containing kobject and cleans that up.
 */
static void kobject_release(struct kref *kref)
{
	struct kobject *kobj = container_of(kref, struct kobject, kref);

	kobject_cleanup(kobj);
}

* kobject_put – decrement refcount for object.

* @kobj: object.

* Decrement the refcount, and if 0, call kobject_cleanup().

void kobject_put(struct kobject *kobj)

{

if (kobj) {

if (!kobj->state_initialized)

WARN(1, KERN_WARNING “kobject: ‘%s’ (%p): is not “

“initialized, yet kobject_put() is being “

“called.\n”, kobject_name(kobj), kobj);

kref_put(&kobj->kref, kobject_release);

}

}

kobject_get()和kobject_put()走的完全是引用计数的路线。kobject_put()会在引用计数降为零时撤销整个kobject的存在：向用户空间发送REMOVE消息，从sysfs中删除相应目录，调用kobj_type中定义的release函数，释放name所占的空间。

看看前面介绍的API。

/* Public kobject API: naming, init/add, rename/move/delete, refcounting, path. */
int kobject_set_name(struct kobject *kobj, const char *name, ...)
			    __attribute__((format(printf, 2, 3)));
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
			   va_list vargs);
void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
int __must_check kobject_add(struct kobject *kobj,
			     struct kobject *parent,
			     const char *fmt, ...);
int __must_check kobject_init_and_add(struct kobject *kobj,
				      struct kobj_type *ktype,
				      struct kobject *parent,
				      const char *fmt, ...);
void kobject_del(struct kobject *kobj);
int __must_check kobject_rename(struct kobject *, const char *new_name);
int __must_check kobject_move(struct kobject *, struct kobject *);
struct kobject *kobject_get(struct kobject *kobj);
void kobject_put(struct kobject *kobj);
char *kobject_get_path(struct kobject *kobj, gfp_t flag);

基本上概括了kobject从创建到删除，包括中间改名字，改位置，以及引用计数的变动。

当然，kobject创建仍比较麻烦，因为ktype需要自己写。下面就是kobject提供的一种快速创建方法。

/*
 * sysfs show() entry point for kobj_attribute files: forwards to the
 * attribute's own show() callback, or returns -EIO if it has none.
 */
static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,
			      char *buf)
{
	struct kobj_attribute *kattr =
		container_of(attr, struct kobj_attribute, attr);

	if (!kattr->show)
		return -EIO;

	return kattr->show(kobj, kattr, buf);
}

/*
 * sysfs store() entry point for kobj_attribute files: forwards to the
 * attribute's own store() callback, or returns -EIO if it has none.
 */
static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,
			       const char *buf, size_t count)
{
	struct kobj_attribute *kattr =
		container_of(attr, struct kobj_attribute, attr);

	if (!kattr->store)
		return -EIO;

	return kattr->store(kobj, kattr, buf, count);
}

/* sysfs_ops that dispatch to the show/store callbacks of a kobj_attribute */
struct sysfs_ops kobj_sysfs_ops = {
	.show	= kobj_attr_show,
	.store	= kobj_attr_store,
};

/*
 * release() of dynamic_kobj_ktype: the kobject itself is the
 * allocation, so it is simply kfree()d.
 */
static void dynamic_kobj_release(struct kobject *kobj)
{
	pr_debug("kobject: (%p): %s\n", kobj, __func__);
	kfree(kobj);
}

/*
 * ktype used by kobject_create(): no default attributes, release()
 * frees the kobject, attribute access goes through kobj_sysfs_ops.
 */
static struct kobj_type dynamic_kobj_ktype = {
	.release	= dynamic_kobj_release,
	.sysfs_ops	= &kobj_sysfs_ops,
};

这个就是kobject自身提供的一种kobj_type，叫做dynamic_kobj_ktype。它没有提供默认的属性，但提供了release函数及访问属性的方法。

/**
 * kobject_create - create a struct kobject dynamically
 *
 * Allocates a zeroed kobject and initializes it with
 * dynamic_kobj_ktype.  Returns NULL if the allocation fails.
 */
struct kobject *kobject_create(void)
{
	struct kobject *kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);

	if (kobj)
		kobject_init(kobj, &dynamic_kobj_ktype);

	return kobj;
}

struct kobject *kobject_create_and_add(const char *name, struct kobject *parent)

{

struct kobject *kobj;

int retval;

kobj = kobject_create();

if (!kobj)

return NULL;

retval = kobject_add(kobj, parent, “%s”, name);

if (retval) {

printk(KERN_WARNING “%s: kobject_add error: %d\n”,

__func__, retval);

kobject_put(kobj);

kobj = NULL;

}

return kobj;

}

在kobject_create()及kobject_create_and_add()中，使用了这种dynamic_kobj_ktype。这是一种很好的偷懒方法。因为release()函数会释放kobj，所以这里的kobj必须是kobject_create()动态创建的。这里的kobject_create()和kobject_init()相对，kobject_create_and_add()和kobject_init_and_add()相对。值得一提的是，这里用kobject_create()和kobject_create_and_add()创建的kobject无法嵌入其它结构，是独立的存在，所以用到的地方很少。

/**
 * kset_init - initialize a kset for use
 * @k: kset
 *
 * Initializes the embedded kobject, the member list and the list lock.
 */
void kset_init(struct kset *k)
{
	kobject_init_internal(&k->kobj);
	INIT_LIST_HEAD(&k->list);
	spin_lock_init(&k->list_lock);
}

kset_init()对kset进行初始化。不过它的界限同kobject差不多。

/**
 * kset_register - initialize and add a kset.
 * @k: kset.
 *
 * The name, parent and kset of the embedded kobject are taken as set by
 * the caller.  A KOBJ_ADD uevent is sent once the kobject was added.
 *
 * Returns 0 on success, -EINVAL for a NULL @k, or the error from
 * kobject_add_internal().
 */
int kset_register(struct kset *k)
{
	int err;

	if (!k)
		return -EINVAL;

	kset_init(k);

	err = kobject_add_internal(&k->kobj);
	if (!err)
		kobject_uevent(&k->kobj, KOBJ_ADD);

	return err;
}

kset_register()最大的特点是简单，它只负责把kset中的kobject连入系统，并发布KOBJ_ADD消息。所以在调用它之前，你要先设置好k->kobj.name、k->kobj.parent、k->kobj.kset。

/**
 * kset_unregister - remove a kset.
 * @k: kset.
 *
 * Calls kobject_put() on the embedded kobject.  A NULL @k is ignored.
 */
void kset_unregister(struct kset *k)
{
	if (k)
		kobject_put(&k->kobj);
}

kset_unregister()只是简单地释放创建时获得的引用计数。使用引用计数就是这么简单。

/**
 * kset_find_obj - search for object in kset.
 * @kset: kset we're looking in.
 * @name: object's name.
 *
 * Walks the kset's list under the list lock.  The first kobject whose
 * name equals @name is returned after kobject_get() was called on it;
 * NULL is returned if there is no match.
 */
struct kobject *kset_find_obj(struct kset *kset, const char *name)
{
	struct kobject *pos;
	struct kobject *found = NULL;

	spin_lock(&kset->list_lock);
	list_for_each_entry(pos, &kset->list, entry) {
		/* unnamed members can never match */
		if (!kobject_name(pos) || strcmp(kobject_name(pos), name) != 0)
			continue;

		found = kobject_get(pos);
		break;
	}
	spin_unlock(&kset->list_lock);

	return found;
}

kset_find_obj()从kset的链表中找到名为name的kobject。这纯粹是一个对外的API。

/*
 * release() of kset_ktype: frees the kset that embeds @kobj.
 */
static void kset_release(struct kobject *kobj)
{
	struct kset *kset = container_of(kobj, struct kset, kobj);
	pr_debug("kobject: '%s' (%p): %s\n",
		 kobject_name(kobj), kobj, __func__);
	kfree(kset);
}

/*
 * ktype assigned by kset_create() to the kobject embedded in the kset:
 * release() frees the whole kset.
 */
static struct kobj_type kset_ktype = {
	.sysfs_ops	= &kobj_sysfs_ops,
	.release = kset_release,
};

与kobject相对的，kset也提供了一种kobj_type，叫做kset_ktype。

static struct kset *kset_create(const char *name,

struct kset_uevent_ops *uevent_ops,

struct kobject *parent_kobj)

{

struct kset *kset;

int retval;

kset = kzalloc(sizeof(*kset), GFP_KERNEL);

if (!kset)

return NULL;

retval = kobject_set_name(&kset->kobj, name);

if (retval) {

kfree(kset);

return NULL;

}

kset->uevent_ops = uevent_ops;

kset->kobj.parent = parent_kobj;

* The kobject of this kset will have a type of kset_ktype and belong to

* no kset itself. That way we can properly free it when it is

* finished being used.

kset->kobj.ktype = &kset_ktype;

kset->kobj.kset = NULL;

return kset;

}

* kset_create_and_add – create a struct kset dynamically and add it to sysfs

* @name: the name for the kset

* @uevent_ops: a struct kset_uevent_ops for the kset

* @parent_kobj: the parent kobject of this kset, if any.

* This function creates a kset structure dynamically and registers it

* with sysfs. When you are finished with this structure, call

* kset_unregister() and the structure will be dynamically freed when it

* is no longer being used.

* If the kset was not able to be created, NULL will be returned.

struct kset *kset_create_and_add(const char *name,

struct kset_uevent_ops *uevent_ops,

struct kobject *parent_kobj)

{

struct kset *kset;

int error;

kset = kset_create(name, uevent_ops, parent_kobj);

if (!kset)

return NULL;

error = kset_register(kset);

if (error) {

kfree(kset);

return NULL;

}

return kset;

}

kset_create()和kset_create_and_add()就是使用kset_ktype的快速创建函数。

说实话，使用kobject_create_and_add()的比较少见，但使用 kset_create_and_add()的情形还是见过一些的。比如sysfs中那些顶层的目录，就是单纯的目录，不需要嵌入什么很复杂的结构，用简单的kset_create_and_add()创建就好了。

/* Accessor for the name of @kobj; @kobj must not be NULL. */
static inline const char *kobject_name(const struct kobject *kobj)
{
	return kobj->name;
}

/* Map a kobject embedded in a kset back to that kset; NULL maps to NULL. */
static inline struct kset *to_kset(struct kobject *kobj)
{
	if (!kobj)
		return NULL;

	return container_of(kobj, struct kset, kobj);
}

/*
 * Take a reference on the kobject embedded in @k and return the kset;
 * NULL is passed through.
 */
static inline struct kset *kset_get(struct kset *k)
{
	if (!k)
		return NULL;

	return to_kset(kobject_get(&k->kobj));
}

/* Drop a reference on the kobject embedded in @k. */
static inline void kset_put(struct kset *k)
{
	kobject_put(&k->kobj);
}

/* Accessor for the ktype of @kobj; @kobj must not be NULL. */
static inline struct kobj_type *get_ktype(struct kobject *kobj)
{
	return kobj->ktype;
}

这些是在kobject.h中的内联函数。这里内联函数更多的意思是方便，易于屏蔽内部实现。

以上就是kobject共800余行的代码实现，当然我们忽略了uevent的那部分。

事实证明，自底向上或者顺序的代码分析方法，还是很适合千行左右的代码分析。而且这样分析很全面，容易我们洞察整个模块的意图，从而在理解代码时从较高的抽象角度去看。

linux的设备驱动模型，是建立在sysfs和kobject之上的，由总线、设备、驱动、类所组成的关系结构。从本节开始，我们将对linux这一设备驱动模型进行深入分析。

头文件是include/linux/device.h，实现在drivers/base目录中。本节要分析的，是其中的设备，主要在core.c中。

/*
 * struct device - the basic device structure of the driver model.
 */
struct device {
	struct device		*parent;	/* parent device */

	struct device_private	*p;	/* driver-core private part */

	struct kobject kobj;		/* embedded kobject, links the device into sysfs */
	const char		*init_name; /* initial name of the device */
	struct device_type	*type;

	struct semaphore	sem;	/* semaphore to synchronize calls to
					 * its driver.
					 */

	struct bus_type	*bus;		/* type of bus device is on */
	struct device_driver *driver;	/* which driver has allocated this
					   device */
	void		*platform_data;	/* Platform specific data, device
					   core doesn't touch it */
	struct dev_pm_info	power;

#ifdef CONFIG_NUMA
	int		numa_node;	/* NUMA node this device is close to */
#endif
	u64		*dma_mask;	/* dma mask (if dma'able device) */
	u64		coherent_dma_mask;/* Like dma_mask, but for
					     alloc_coherent mappings as
					     not all hardware supports
					     64 bit addresses for consistent
					     allocations such descriptors. */

	struct device_dma_parameters *dma_parms;

	struct list_head	dma_pools;	/* dma pools (if dma'ble) */

	struct dma_coherent_mem	*dma_mem; /* internal for coherent mem
					     override */
	/* arch specific additions */
	struct dev_archdata	archdata;

	dev_t			devt;	/* dev_t, creates the sysfs "dev" */

	spinlock_t		devres_lock;	/* protects devres_head */
	struct list_head	devres_head;	/* device resources */

	struct klist_node	knode_class;	/* node in the class's device list */
	struct class		*class;
	const struct attribute_group **groups;	/* optional groups */

	void	(*release)(struct device *dev);	/* tried first by device_release() */
};

先来分析下struct device的结构变量。首先是指向父节点的指针parent，kobj是内嵌在device中的kobject，用于把它联系到sysfs中。bus是对设备所在总线的指针，driver是对设备所用驱动的指针。还有DMA需要的数据，表示设备号的devt，表示设备资源的devres_head和保护它的devres_lock。指向类的指针class，knode_class是被连入class链表时所用的klist节点。groups是设备的属性集合。release应该是设备释放时调用的函数。

/*
 * struct device_private - driver-core internal part of struct device,
 * freed by device_release().
 */
struct device_private {
	struct klist klist_children;	/* list of child devices */
	struct klist_node knode_parent;	/* node in the parent's klist_children */
	struct klist_node knode_driver;	/* node in the driver's device list */
	struct klist_node knode_bus;	/* node in the bus's device list */
	void *driver_data;		/* driver-specific data for this device */
	struct device *device;		/* back pointer to the owning device */
};

/* Map a klist node on a parent's klist_children to its device_private. */
#define to_device_private_parent(obj)	\
	container_of(obj, struct device_private, knode_parent)
/* Map a klist node on a driver's device list to its device_private. */
#define to_device_private_driver(obj)	\
	container_of(obj, struct device_private, knode_driver)
/* Map a klist node on a bus's device list to its device_private. */
#define to_device_private_bus(obj)	\
	container_of(obj, struct device_private, knode_bus)

struct device中有一部分不愿意让外界看到，所以做出struct device_private结构，包括了设备驱动模型内部的链接。klist_children是子设备的链表，knode_parent是连入父设备的klist_children时所用的节点，knode_driver是连入驱动的设备链表所用的节点，knode_bus是连入总线的设备链表时所用的节点。driver_data用于在设备结构中存放相关的驱动信息，也许是驱动专门为设备建立的结构实例。device则是指向struct device_private所属的device。

下面还有一些宏，to_device_private_parent()是从父设备的klist_children上节点，获得相应的device_private。to_device_private_driver()是从驱动的设备链表上节点，获得对应的device_private。to_device_private_bus()是从总线的设备链表上节点，获得对应的device_private。

或许会奇怪，为什么knode_class没有被移入struct device_private，或许有外部模块需要用到它。

* The type of device, “struct device” is embedded in. A class

* or bus can contain devices of different types

* like “partitions” and “disks”, “mouse” and “event”.

* This identifies the device type and carries type-specific

* information, equivalent to the kobj_type of a kobject.

* If “name” is specified, the uevent will contain it in

* the DEVTYPE variable.

struct device_type {

const char *name;

const struct attribute_group **groups;

int (*uevent)(struct device *dev, struct kobj_uevent_env *env);

char *(*devnode)(struct device *dev, mode_t *mode);

void (*release)(struct device *dev);

const struct dev_pm_ops *pm;

};

device竟然有device_type，类似于与kobject相对的kobj_type，之后我们再看它怎么用。

/* interface for exporting device attributes */
struct device_attribute {
	struct attribute	attr;	/* embedded generic attribute */
	/* read callback, invoked by dev_attr_show(); may be NULL */
	ssize_t (*show)(struct device *dev, struct device_attribute *attr,
			char *buf);
	/* write callback, invoked by dev_attr_store(); may be NULL */
	ssize_t (*store)(struct device *dev, struct device_attribute *attr,
			 const char *buf, size_t count);
};

/* Define a struct device_attribute named dev_attr_<_name>. */
#define DEVICE_ATTR(_name, _mode, _show, _store) \
struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store)

这个device_attribute显然就是device对struct attribute的封装，新加的show()、store()函数都是以与设备相关的结构调用的。

至于device中其它的archdata、dma、devres，都是作为设备特有的，我们现在主要关心设备驱动模型的建立，这些会尽量忽略。

下面就来看看device的实现，这主要在core.c中。

/*
 * devices_init - create the "devices" kset and the "dev" kobject with
 * its "block" and "char" children.
 *
 * Returns 0 on success or -ENOMEM, in which case everything created so
 * far is released again.
 */
int __init devices_init(void)
{
	devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
	if (!devices_kset)
		return -ENOMEM;
	dev_kobj = kobject_create_and_add("dev", NULL);
	if (!dev_kobj)
		goto dev_kobj_err;
	sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj);
	if (!sysfs_dev_block_kobj)
		goto block_kobj_err;
	sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj);
	if (!sysfs_dev_char_kobj)
		goto char_kobj_err;

	return 0;

	/* unwind in reverse order of creation */
 char_kobj_err:
	kobject_put(sysfs_dev_block_kobj);
 block_kobj_err:
	kobject_put(dev_kobj);
 dev_kobj_err:
	kset_unregister(devices_kset);
	return -ENOMEM;
}

这是在设备驱动模型初始化时调用的device部分初始的函数devices_init()。它干的事情我们都很熟悉，就是建立sysfs中的devices目录，和dev目录。还在dev目录下又建立了block和char两个子目录。因为dev目录只打算存放辅助的设备号，所以没必要使用kset。

/*
 * sysfs show() entry point for device attributes: forwards to the
 * attribute's show() callback, or returns -EIO if it has none.  A
 * return value of PAGE_SIZE or more is reported as a bad count.
 */
static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr,
			     char *buf)
{
	struct device_attribute *dev_attr = to_dev_attr(attr);
	struct device *dev = to_dev(kobj);
	ssize_t ret = -EIO;

	if (dev_attr->show)
		ret = dev_attr->show(dev, dev_attr, buf);
	if (ret >= (ssize_t)PAGE_SIZE) {
		print_symbol("dev_attr_show: %s returned bad count\n",
				(unsigned long)dev_attr->show);
	}
	return ret;
}

/*
 * sysfs store() entry point for device attributes: forwards to the
 * attribute's store() callback, or returns -EIO if it has none.
 */
static ssize_t dev_attr_store(struct kobject *kobj, struct attribute *attr,
			      const char *buf, size_t count)
{
	struct device_attribute *dev_attr = to_dev_attr(attr);
	struct device *dev = to_dev(kobj);

	if (!dev_attr->store)
		return -EIO;

	return dev_attr->store(dev, dev_attr, buf, count);
}

/* sysfs_ops that dispatch to the show/store callbacks of a device_attribute */
static struct sysfs_ops dev_sysfs_ops = {
	.show	= dev_attr_show,
	.store	= dev_attr_store,
};

看到这里是不是很熟悉，dev_sysfs_ops就是device准备注册到sysfs中的操作函数。dev_attr_show()和dev_attr_store()都会再调用与属性相关的函数。

static void device_release(struct kobject *kobj)

{

struct device *dev = to_dev(kobj);

struct device_private *p = dev->p;

if (dev->release)

dev->release(dev);

else if (dev->type && dev->type->release)

dev->type->release(dev);

else if (dev->class && dev->class->dev_release)

dev->class->dev_release(dev);

else

WARN(1, KERN_ERR “Device ‘%s’ does not have a release() “

“function, it is broken and must be fixed.\n”,

dev_name(dev));

kfree(p);

}

/* ktype of the kobject embedded in every struct device */
static struct kobj_type device_ktype = {
	.release	= device_release,
	.sysfs_ops	= &dev_sysfs_ops,
};

使用的release函数是device_release。在释放device时，会按优先级依次查找，并且只调用其中第一个存在的release函数：先是device结构中定义的release函数，其次是device_type中定义的release函数，再次是device所属的class中所定义的dev_release函数；如果三者都没有定义，就打印警告。最后会把device_private结构释放掉。

/*
 * uevent filter of the devices kset.  Returns 1 only for kobjects of
 * type device_ktype whose device has a bus or a class; 0 otherwise.
 */
static int dev_uevent_filter(struct kset *kset, struct kobject *kobj)
{
	struct device *dev;

	if (get_ktype(kobj) != &device_ktype)
		return 0;

	dev = to_dev(kobj);
	if (dev->bus || dev->class)
		return 1;

	return 0;
}

/*
 * uevent name callback of the devices kset: the name of the device's
 * bus if it has one, else the name of its class, else NULL.
 */
static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj)
{
	struct device *dev = to_dev(kobj);
	const char *name = NULL;

	if (dev->bus)
		name = dev->bus->name;
	else if (dev->class)
		name = dev->class->name;

	return name;
}

/*
 * uevent callback of the devices kset: adds the device-related
 * environment variables (MAJOR/MINOR/DEVNAME/DEVMODE, DEVTYPE, DRIVER
 * and, with CONFIG_SYSFS_DEPRECATED, the PHYSDEV* ones), then lets the
 * bus, the class and the device type add theirs.
 *
 * Returns the value of the last bus/class/type uevent() callback that
 * was invoked, or 0 if none was.
 */
static int dev_uevent(struct kset *kset, struct kobject *kobj,
		      struct kobj_uevent_env *env)
{
	struct device *dev = to_dev(kobj);
	int retval = 0;

	/* add device node properties if present */
	if (MAJOR(dev->devt)) {
		const char *tmp;
		const char *name;
		mode_t mode = 0;

		add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt));
		add_uevent_var(env, "MINOR=%u", MINOR(dev->devt));
		name = device_get_devnode(dev, &mode, &tmp);
		if (name) {
			add_uevent_var(env, "DEVNAME=%s", name);
			kfree(tmp);
			if (mode)
				add_uevent_var(env, "DEVMODE=%#o", mode & 0777);
		}
	}

	if (dev->type && dev->type->name)
		add_uevent_var(env, "DEVTYPE=%s", dev->type->name);

	if (dev->driver)
		add_uevent_var(env, "DRIVER=%s", dev->driver->name);

#ifdef CONFIG_SYSFS_DEPRECATED
	if (dev->class) {
		struct device *parent = dev->parent;

		/* find first bus device in parent chain */
		while (parent && !parent->bus)
			parent = parent->parent;
		if (parent && parent->bus) {
			const char *path;

			path = kobject_get_path(&parent->kobj, GFP_KERNEL);
			if (path) {
				add_uevent_var(env, "PHYSDEVPATH=%s", path);
				kfree(path);
			}

			add_uevent_var(env, "PHYSDEVBUS=%s", parent->bus->name);

			if (parent->driver)
				add_uevent_var(env, "PHYSDEVDRIVER=%s",
					       parent->driver->name);
		}
	} else if (dev->bus) {
		add_uevent_var(env, "PHYSDEVBUS=%s", dev->bus->name);

		if (dev->driver)
			add_uevent_var(env, "PHYSDEVDRIVER=%s",
				       dev->driver->name);
	}
#endif

	/* have the bus specific function add its stuff */
	if (dev->bus && dev->bus->uevent) {
		retval = dev->bus->uevent(dev, env);
		if (retval)
			pr_debug("device: '%s': %s: bus uevent() returned %d\n",
				 dev_name(dev), __func__, retval);
	}

	/* have the class specific function add its stuff */
	if (dev->class && dev->class->dev_uevent) {
		retval = dev->class->dev_uevent(dev, env);
		if (retval)
			pr_debug("device: '%s': %s: class uevent() "
				 "returned %d\n", dev_name(dev),
				 __func__, retval);
	}

	/* have the device type specific function add its stuff */
	if (dev->type && dev->type->uevent) {
		retval = dev->type->uevent(dev, env);
		if (retval)
			pr_debug("device: '%s': %s: dev_type uevent() "
				 "returned %d\n", dev_name(dev),
				 __func__, retval);
	}

	return retval;
}

/* uevent hooks passed to kset_create_and_add() for the "devices" kset */
static struct kset_uevent_ops device_uevent_ops = {
	.filter =	dev_uevent_filter,
	.name =		dev_uevent_name,
	.uevent =	dev_uevent,
};

前面在讲到kset时，我们并未关注其中的kset_uevent_ops结构变量。但这里device既然用到了，我们就对其中的三个函数做简单介绍。kset_uevent_ops中的函数是用于管理kset内部kobject的uevent操作。其中filter函数用于阻止一个kobject向用户空间发送uevent，返回值为0表示阻止。这里dev_uevent_filter()检查device所属的bus或者class是否存在，如果都不存在，也就没有发送uevent的必要了。name函数是用于覆盖kset发送给用户空间的名称。这里dev_uevent_name()选择使用bus或者class的名称。uevent()函数是在uevent将被发送到用户空间之前调用的，用于向uevent中增加新的环境变量。dev_uevent()的实现很热闹，向uevent中添加了各种环境变量。

/*
 * show() of the "uevent" device attribute: prints, one per line, the
 * environment variables that the owning kset's uevent() callback
 * generates for this device.
 *
 * Returns the number of bytes written to @buf (0 if the device belongs
 * to no kset, the kset has no uevent() callback, the filter rejects the
 * device or the callback fails), or -ENOMEM.
 */
static ssize_t show_uevent(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
	struct kobject *top_kobj;
	struct kset *kset;
	struct kobj_uevent_env *env = NULL;
	int i;
	size_t count = 0;
	int retval;

	/* search the kset, the device belongs to */
	top_kobj = &dev->kobj;
	while (!top_kobj->kset && top_kobj->parent)
		top_kobj = top_kobj->parent;
	if (!top_kobj->kset)
		goto out;

	kset = top_kobj->kset;
	if (!kset->uevent_ops || !kset->uevent_ops->uevent)
		goto out;

	/* respect filter */
	if (kset->uevent_ops && kset->uevent_ops->filter)
		if (!kset->uevent_ops->filter(kset, &dev->kobj))
			goto out;

	env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
	if (!env)
		return -ENOMEM;

	/* let the kset specific function add its keys */
	retval = kset->uevent_ops->uevent(kset, &dev->kobj, env);
	if (retval)
		goto out;

	/* copy keys to file */
	for (i = 0; i < env->envp_idx; i++)
		count += sprintf(&buf[count], "%s\n", env->envp[i]);
out:
	/* env is still NULL on the early exits; kfree(NULL) is a no-op */
	kfree(env);
	return count;
}

/*
 * store() of the "uevent" device attribute: sends the uevent named in
 * @buf for this device.  An unrecognized action string is reported and
 * a KOBJ_ADD event is sent instead.  Always returns @count.
 */
static ssize_t store_uevent(struct device *dev, struct device_attribute *attr,
			    const char *buf, size_t count)
{
	enum kobject_action action;

	if (kobject_action_type(buf, count, &action) == 0) {
		kobject_uevent(&dev->kobj, action);
		goto out;
	}

	dev_err(dev, "uevent: unsupported action-string; this will "
		     "be ignored in a future kernel version\n");
	kobject_uevent(&dev->kobj, KOBJ_ADD);
out:
	return count;
}

/* "uevent" device attribute: read via show_uevent(), written via store_uevent(). */
static struct device_attribute uevent_attr =

__ATTR(uevent, S_IRUGO | S_IWUSR, show_uevent, store_uevent);

device不仅在kset中添加了对uevent的管理，而且还把uevent信息做成设备的一个属性uevent。其中show_uevent()是显示uevent中环境变量的，store_uevent()是发送uevent的。

/*
 * device_add_attributes - create a sysfs file for each attribute in @attrs.
 * @dev:   device that owns the files.
 * @attrs: array terminated by an entry whose attr_name() is NULL; may be NULL.
 *
 * Returns 0 on success. On failure the files already created are removed
 * again and the error from device_create_file() is returned.
 */
static int device_add_attributes(struct device *dev,
				 struct device_attribute *attrs)
{
	int error = 0;
	int i;

	if (attrs) {
		for (i = 0; attr_name(attrs[i]); i++) {
			error = device_create_file(dev, &attrs[i]);
			if (error)
				break;
		}
		/* roll back: entry i failed, so remove entries i-1 .. 0 */
		if (error)
			while (--i >= 0)
				device_remove_file(dev, &attrs[i]);
	}
	return error;
}

/*
 * Remove the sysfs file of every attribute in @attrs (an array terminated
 * by an entry whose attr_name() is NULL). A NULL @attrs is a no-op.
 */
static void device_remove_attributes(struct device *dev,
				     struct device_attribute *attrs)
{
	int idx;

	if (!attrs)
		return;

	idx = 0;
	while (attr_name(attrs[idx])) {
		device_remove_file(dev, &attrs[idx]);
		idx++;
	}
}

/*
 * device_add_groups - create every attribute group in @groups under @dev.
 * @dev:    device whose kobject receives the groups.
 * @groups: NULL-terminated array of groups; may be NULL.
 *
 * Returns 0 on success. On failure the groups already created are removed
 * again and the error from sysfs_create_group() is returned.
 */
static int device_add_groups(struct device *dev,
			     const struct attribute_group **groups)
{
	int error = 0;
	int i;

	if (groups) {
		for (i = 0; groups[i]; i++) {
			error = sysfs_create_group(&dev->kobj, groups[i]);
			if (error) {
				/* roll back groups i-1 .. 0 */
				while (--i >= 0)
					sysfs_remove_group(&dev->kobj,
							   groups[i]);
				break;
			}
		}
	}
	return error;
}

/*
 * Remove every attribute group in the NULL-terminated array @groups from
 * @dev's kobject. A NULL @groups is a no-op.
 */
static void device_remove_groups(struct device *dev,
				 const struct attribute_group **groups)
{
	const struct attribute_group **grp;

	if (!groups)
		return;

	for (grp = groups; *grp; grp++)
		sysfs_remove_group(&dev->kobj, *grp);
}

以上四个内部函数是用来向device中添加或删除属性与属性集合的。

device_add_attributes、device_remove_attributes、device_add_groups、device_remove_groups，都是直接通过sysfs提供的API实现。

/*
 * Add, in order, the class attributes, the device-type groups and the
 * device's own groups. If a later step fails, the earlier ones are undone
 * and the error is returned; returns 0 on success.
 */
static int device_add_attrs(struct device *dev)
{
	struct class *cls = dev->class;
	struct device_type *dtype = dev->type;
	int rc;

	if (cls) {
		rc = device_add_attributes(dev, cls->dev_attrs);
		if (rc)
			return rc;
	}

	if (dtype) {
		rc = device_add_groups(dev, dtype->groups);
		if (rc)
			goto undo_class_attrs;
	}

	rc = device_add_groups(dev, dev->groups);
	if (!rc)
		return 0;

	/* the device's own groups failed: drop the type groups first */
	if (dtype)
		device_remove_groups(dev, dtype->groups);
undo_class_attrs:
	if (cls)
		device_remove_attributes(dev, cls->dev_attrs);
	return rc;
}

static void device_remove_attrs(struct device *dev)

{

struct class *class = dev->class;

struct device_type *type = dev->type;

device_remove_groups(dev, dev->groups);

if (type)

device_remove_groups(dev, type->groups);

if (class)

device_remove_attributes(dev, class->dev_attrs);

}

device_add_attrs()实际负责device中的属性添加。也是几个部分的集合，包括class中的dev_attrs，device_type中的groups，还有device本身的groups。

device_remove_attrs()则负责对应的device属性删除工作。

/*
 * print_dev_t - format a device number as "major:minor\n" into @buffer.
 * Evaluates to the sprintf() return value (characters written).
 */
#define print_dev_t(buffer, dev)					\
	sprintf((buffer), "%u:%u\n", MAJOR(dev), MINOR(dev))

/*
 * "show" handler of the dev attribute: writes the device number of @dev
 * into @buf via print_dev_t() and returns the number of characters written.
 */
static ssize_t show_dev(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	ssize_t written;

	written = print_dev_t(buf, dev->devt);
	return written;
}

/* Read-only "dev" device attribute, served by show_dev(). */
static struct device_attribute devt_attr =

__ATTR(dev, S_IRUGO, show_dev, NULL);

这里又定义了一个名为dev的属性，就是显示设备的设备号。

* device_create_file – create sysfs attribute file for device.

* @dev: device.

* @attr: device attribute descriptor.

int device_create_file(struct device *dev, struct device_attribute *attr)

{

int error = 0;

if (dev)

error = sysfs_create_file(&dev->kobj, &attr->attr);

return error;

}

* device_remove_file – remove sysfs attribute file.

* @dev: device.

* @attr: device attribute descriptor.

void device_remove_file(struct device *dev, struct device_attribute *attr)

{

if (dev)

sysfs_remove_file(&dev->kobj, &attr->attr);

}

* device_create_bin_file – create sysfs binary attribute file for device.

* @dev: device.

* @attr: device binary attribute descriptor.

int device_create_bin_file(struct device *dev, struct bin_attribute *attr)

{

int error = -EINVAL;

if (dev)

error = sysfs_create_bin_file(&dev->kobj, attr);

return error;

}

* device_remove_bin_file – remove sysfs binary attribute file

* @dev: device.

* @attr: device binary attribute descriptor.

void device_remove_bin_file(struct device *dev, struct bin_attribute *attr)

{

if (dev)

sysfs_remove_bin_file(&dev->kobj, attr);

}

/*
 * Hand @func to sysfs_schedule_callback() for @dev's kobject, with @dev as
 * the callback argument and @owner as the owning module. Returns whatever
 * sysfs_schedule_callback() returns.
 */
int device_schedule_callback_owner(struct device *dev,
		void (*func)(struct device *), struct module *owner)
{
	/* sysfs takes a void * argument; adapt the callback type */
	void (*callback)(void *) = (void (*)(void *)) func;

	return sysfs_schedule_callback(&dev->kobj, callback, dev, owner);
}

这里的五个函数，也是对sysfs提供的API的简单封装。

device_create_file()和device_remove_file()提供直接的属性文件管理方法。

device_create_bin_file()和device_remove_bin_file()则是提供设备管理二进制文件的方法。

device_schedule_callback_owner()也是简单地将func加入工作队列。

static void klist_children_get(struct klist_node *n)

{

struct device_private *p = to_device_private_parent(n);

struct device *dev = p->device;

get_device(dev);

}

static void klist_children_put(struct klist_node *n)

{

struct device_private *p = to_device_private_parent(n);

struct device *dev = p->device;

put_device(dev);

}

如果之前认真看过klist的实现，应该知道，klist_children_get()和klist_children_put()就是在设备挂入和删除父设备的klist_children链表时调用的函数。在父设备klist_children链表上的指针，相当于对device的一个引用计数。

/*
 * Take a reference on @dev through its embedded kobject.
 * Returns the device derived from kobject_get()'s result, or NULL when
 * @dev is NULL.
 */
struct device *get_device(struct device *dev)
{
	if (!dev)
		return NULL;

	return to_dev(kobject_get(&dev->kobj));
}

* put_device – decrement reference count.

* @dev: device in question.

void put_device(struct device *dev)

{

/* might_sleep(); */

if (dev)

kobject_put(&dev->kobj);

}

device中的引用计数，完全交给内嵌的kobject来做。如果引用计数降为零，自然是调用之前说到的包含甚广的device_release函数。

/*
 * device_initialize - prepare the generic parts of @dev for use.
 *
 * Puts the embedded kobject into devices_kset with device_ktype, and
 * initialises the DMA-pool list, semaphore, devres lock and list, wakeup
 * state, PM state and NUMA node (-1). It does not add the device to the
 * hierarchy; device_register() calls device_add() for that.
 */
void device_initialize(struct device *dev)

{

/* the kset is assigned before kobject_init() is called */
dev->kobj.kset = devices_kset;

kobject_init(&dev->kobj, &device_ktype);

INIT_LIST_HEAD(&dev->dma_pools);

init_MUTEX(&dev->sem);

spin_lock_init(&dev->devres_lock);

INIT_LIST_HEAD(&dev->devres_head);

device_init_wakeup(dev, 0);

device_pm_init(dev);

set_dev_node(dev, -1);

}

device_initialize()就是device结构的初始化函数，它把device中能初始化的部分全初始化。它的界限在其中kobj的位置与device在设备驱动模型中的位置，这些必须由外部设置。可以看到，调用kobject_init()时，kobject的kobj_type选择了device_ktype，其中主要是sysfs_ops的两个函数，还有device_release函数。

/*
 * Return the "virtual" directory kobject under devices_kset, creating it
 * on first use. Returns NULL if kobject_create_and_add() fails; a later
 * call then tries again.
 */
static struct kobject *virtual_device_parent(struct device *dev)
{
	static struct kobject *virtual_dir = NULL;

	if (!virtual_dir)
		virtual_dir = kobject_create_and_add("virtual",
						     &devices_kset->kobj);

	return virtual_dir;
}

static struct kobject *get_device_parent(struct device *dev,

struct device *parent)

{

int retval;

if (dev->class) {

struct kobject *kobj = NULL;

struct kobject *parent_kobj;

struct kobject *k;

* If we have no parent, we live in “virtual”.

* Class-devices with a non class-device as parent, live

* in a “glue” directory to prevent namespace collisions.

if (parent == NULL)

parent_kobj = virtual_device_parent(dev);

else if (parent->class)

return &parent->kobj;

else

parent_kobj = &parent->kobj;

/* find our class-directory at the parent and reference it */

spin_lock(&dev->class->p->class_dirs.list_lock);

list_for_each_entry(k, &dev->class->p->class_dirs.list, entry)

if (k->parent == parent_kobj) {

kobj = kobject_get(k);

break;

}

spin_unlock(&dev->class->p->class_dirs.list_lock);

if (kobj)

return kobj;

/* or create a new class-directory at the parent device */

k = kobject_create();

if (!k)

return NULL;

k->kset = &dev->class->p->class_dirs;

retval = kobject_add(k, parent_kobj, “%s”, dev->class->name);

if (retval < 0) {

kobject_put(k);

return NULL;

}

/* do not emit an uevent for this simple “glue” directory */

return k;

}

if (parent)

return &parent->kobj;

return NULL;

}

这里的get_device_parent()就是获取父节点的kobject，但也并非就如此简单。get_device_parent()的返回值直接决定了device将被挂在哪个目录下。到底该挂在哪，是由dev->class、dev->parent、dev->parent->class等因素综合决定的。我们看get_device_parent()中是如何判断的。如果dev->class为空，表示一切随父设备，有parent则返回parent->kobj，没有则返回NULL。如果有dev->class呢，情况就比较复杂了，也许device有着与parent不同的class，也许device还没有一个parent，等等。我们看具体的情况。如果parent不为空，而且存在parent->class，则还放在parent目录下。不然，要么parent不存在，要么parent没有class，很难直接将有class的device放在parent下面。目前的解决方法很简单，在parent与device之间，再加一层表示class的目录。如果parent都没有，那就把/sys/devices/virtual当做parent。class->p->class_dirs就是专门存放这种中间kobject的kset。思路理清后，再结合实际的sysfs，代码就很容易看懂了。

/*
 * Drop one reference on @glue_dir, but only if it really is a glue
 * directory, i.e. its kset is the class_dirs kset of @dev's class.
 */
static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
{
	if (!glue_dir)
		return;
	if (!dev->class)
		return;

	/* anything outside class_dirs is not a glue directory */
	if (glue_dir->kset == &dev->class->p->class_dirs)
		kobject_put(glue_dir);
}

/* Release @dev's sysfs parent if it is a glue directory. */
static void cleanup_device_parent(struct device *dev)
{
	struct kobject *sysfs_parent = dev->kobj.parent;

	cleanup_glue_dir(dev, sysfs_parent);
}

cleanup_device_parent()是取消对parent引用时调用的函数，看起来只针对这种glue形式的目录起作用。

/*
 * Set dev->kobj.parent to the kobject chosen by get_device_parent().
 * The field is left untouched when no parent kobject is found.
 */
static void setup_parent(struct device *dev, struct device *parent)
{
	struct kobject *chosen = get_device_parent(dev, parent);

	if (!chosen)
		return;

	dev->kobj.parent = chosen;
}

setup_parent()就是调用get_device_parent()获得应该存放的父目录kobj，并把dev->kobj.parent设为它。

/*
 * device_add_class_symlinks - create the symlinks tying @dev to its class.
 *
 * Creates "subsystem" in the device directory, a link named after the
 * device in the class directory and, for non-partition devices with a
 * parent, "device" pointing at the parent. Devices without a class get
 * nothing.
 *
 * Returns 0 on success; on failure the links already created are removed
 * and the error from sysfs_create_link() is returned.
 */
static int device_add_class_symlinks(struct device *dev)
{
	int error;

	if (!dev->class)
		return 0;

	error = sysfs_create_link(&dev->kobj,
				  &dev->class->p->class_subsys.kobj,
				  "subsystem");
	if (error)
		goto out;

	/* link in the class directory pointing to the device */
	error = sysfs_create_link(&dev->class->p->class_subsys.kobj,
				  &dev->kobj, dev_name(dev));
	if (error)
		goto out_subsys;

	if (dev->parent && device_is_not_partition(dev)) {
		error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
					  "device");
		if (error)
			goto out_busid;
	}
	return 0;

out_busid:
	sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev));
out_subsys:
	sysfs_remove_link(&dev->kobj, "subsystem");
out:
	return error;
}

device_add_class_symlinks()在device和class之间添加一些软链接。在device目录下创建指向class的subsystem文件，在class目录下创建指向device的同名文件。如果device有父设备，而且device不是块设备分区，则在device目录下建立一个指向父设备的device链接文件。这一点在usb设备和usb接口间很常见。

/*
 * device_remove_class_symlinks - remove the symlinks between @dev and its
 * class. Does nothing for a device without a class.
 *
 * With CONFIG_SYSFS_DEPRECATED the parent additionally carries a link
 * named by make_class_name(), and the link in the class directory is only
 * removed when the device does not live directly in that directory.
 */
static void device_remove_class_symlinks(struct device *dev)
{
	if (!dev->class)
		return;

#ifdef CONFIG_SYSFS_DEPRECATED
	if (dev->parent && device_is_not_partition(dev)) {
		char *class_name;

		class_name = make_class_name(dev->class->name, &dev->kobj);
		if (class_name) {
			sysfs_remove_link(&dev->parent->kobj, class_name);
			kfree(class_name);
		}
		sysfs_remove_link(&dev->kobj, "device");
	}

	if (dev->kobj.parent != &dev->class->p->class_subsys.kobj &&
	    device_is_not_partition(dev))
		sysfs_remove_link(&dev->class->p->class_subsys.kobj,
				  dev_name(dev));
#else
	if (dev->parent && device_is_not_partition(dev))
		sysfs_remove_link(&dev->kobj, "device");

	sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev));
#endif

	sysfs_remove_link(&dev->kobj, "subsystem");
}

device_remove_class_symlinks()删除device和class之间的软链接。

/* Return the name of @dev, which is the name of its embedded kobject. */
static inline const char *dev_name(const struct device *dev)
{
	const char *name = kobject_name(&dev->kobj);

	return name;
}

int dev_set_name(struct device *dev, const char *fmt, …)

{

va_list vargs;

int err;

va_start(vargs, fmt);

err = kobject_set_name_vargs(&dev->kobj, fmt, vargs);

va_end(vargs);

return err;

}

dev_name()获得设备名称，dev_set_name()设置设备名称。但这里的dev_set_name()只能在设备未注册前使用。device的名称其实是完全靠dev->kobj管理的。

/*
 * Pick the kobject under which @dev's "major:minor" link is placed:
 * the class's dev_kobj when the device has a class, otherwise
 * sysfs_dev_char_kobj.
 */
static struct kobject *device_to_dev_kobj(struct device *dev)
{
	return dev->class ? dev->class->dev_kobj : sysfs_dev_char_kobj;
}

device_to_dev_kobj()为dev选择合适的/sys/dev下的kobject，或者是块设备，或者是字符设备，或者没有。

/*
 * format_dev_t - format a device number as "major:minor" into @buffer.
 * Evaluates to @buffer so the result can be used directly as a string.
 */
#define format_dev_t(buffer, dev)					\
	({								\
		sprintf(buffer, "%u:%u", MAJOR(dev), MINOR(dev));	\
		buffer;							\
	})

/*
 * Create a symlink named "major:minor" that points at @dev, inside the
 * kobject chosen by device_to_dev_kobj(). Returns 0 when there is no such
 * kobject, otherwise the result of sysfs_create_link().
 */
static int device_create_sys_dev_entry(struct device *dev)
{
	struct kobject *dir = device_to_dev_kobj(dev);
	char link_name[15];

	if (!dir)
		return 0;

	format_dev_t(link_name, dev->devt);
	return sysfs_create_link(dir, &dev->kobj, link_name);
}

/*
 * Remove the "major:minor" symlink created by
 * device_create_sys_dev_entry(). No-op when device_to_dev_kobj() yields
 * no kobject.
 */
static void device_remove_sys_dev_entry(struct device *dev)
{
	struct kobject *dir = device_to_dev_kobj(dev);
	char link_name[15];

	if (!dir)
		return;

	format_dev_t(link_name, dev->devt);
	sysfs_remove_link(dir, link_name);
}

device_create_sys_dev_entry()是在/sys/dev相应的目录下建立对设备的软链接。先是通过device_to_dev_kobj()获得父节点的kobj，然后调用sysfs_create_link()建立软链接。

device_remove_sys_dev_entry()与其操作正相反，删除在/sys/dev下建立的软链接。

int device_private_init(struct device *dev)

{

dev->p = kzalloc(sizeof(*dev->p), GFP_KERNEL);

if (!dev->p)

return -ENOMEM;

dev->p->device = dev;

klist_init(&dev->p->klist_children, klist_children_get,

klist_children_put);

return 0;

}

device_private_init()分配并初始化dev->p。至于空间的释放，是等到释放设备时调用的device_release()中。

之前的函数比较散乱，或许找不出一个整体的印象。但下面马上就要看到重要的部分了，因为代码终于攒到了爆发的程度！

/**
 * device_register - register a device with the system.
 * @dev: pointer to the device structure
 *
 * This happens in two clean steps - initialize the device
 * and add it to the system. The two steps can be called
 * separately, but this is the easiest and most common.
 * I.e. you should only call the two helpers separately if
 * have a clearly defined need to use and refcount the device
 * before it is added to the hierarchy.
 *
 * NOTE: _Never_ directly free @dev after calling this function, even
 * if it returned an error! Always use put_device() to give up the
 * reference initialized in this function instead.
 */
int device_register(struct device *dev)
{
	device_initialize(dev);
	return device_add(dev);
}

device_register()是提供给外界注册设备的接口。它先是调用device_initialize()初始化dev结构，然后调用device_add()将其加入系统中。但要注意，在调用device_register()注册dev之前，有一些dev结构变量是需要自行设置的。这其中有指明设备位置的struct device *parent,struct bus_type *bus, struct class *class，有指明设备属性的 const char *init_name, struct device_type *type, const struct attribute_group **groups, void (*release)(struct device *dev), dev_t devt，等等。不同设备的使用方法不同，我们留待之后再具体分析。device_initialize()我们已经看过，下面重点看看device_add()是如何实现的。

int device_add(struct device *dev)

{

struct device *parent = NULL;

struct class_interface *class_intf;

int error = -EINVAL;

dev = get_device(dev);

if (!dev)

goto done;

if (!dev->p) {

error = device_private_init(dev);

if (error)

goto done;

}

* for statically allocated devices, which should all be converted

* some day, we need to initialize the name. We prevent reading back

* the name, and force the use of dev_name()

if (dev->init_name) {

dev_set_name(dev, “%s”, dev->init_name);

dev->init_name = NULL;

}

if (!dev_name(dev))

goto name_error;

pr_debug(“device: ‘%s’: %s\n”, dev_name(dev), __func__);

parent = get_device(dev->parent);

setup_parent(dev, parent);

/* use parent numa_node */

if (parent)

set_dev_node(dev, dev_to_node(parent));

/* first, register with generic layer. */

/* we require the name to be set before, and pass NULL */

error = kobject_add(&dev->kobj, dev->kobj.parent, NULL);

if (error)

goto Error;

/* notify platform of device entry */

if (platform_notify)

platform_notify(dev);

error = device_create_file(dev, &uevent_attr);

if (error)

goto attrError;

if (MAJOR(dev->devt)) {

error = device_create_file(dev, &devt_attr);

if (error)

goto ueventattrError;

error = device_create_sys_dev_entry(dev);

if (error)

goto devtattrError;

devtmpfs_create_node(dev);

}

error = device_add_class_symlinks(dev);

if (error)

goto SymlinkError;

error = device_add_attrs(dev);

if (error)

goto AttrsError;

error = bus_add_device(dev);

if (error)

goto BusError;

error = dpm_sysfs_add(dev);

if (error)

goto DPMError;

device_pm_add(dev);

/* Notify clients of device addition. This call must come

* after dpm_sysf_add() and before kobject_uevent().

if (dev->bus)

blocking_notifier_call_chain(&dev->bus->p->bus_notifier,

BUS_NOTIFY_ADD_DEVICE, dev);

kobject_uevent(&dev->kobj, KOBJ_ADD);

bus_probe_device(dev);

if (parent)

klist_add_tail(&dev->p->knode_parent,

&parent->p->klist_children);

if (dev->class) {

mutex_lock(&dev->class->p->class_mutex);

/* tie the class to the device */

klist_add_tail(&dev->knode_class,

&dev->class->p->class_devices);

/* notify any interfaces that the device is here */

list_for_each_entry(class_intf,

&dev->class->p->class_interfaces, node)

if (class_intf->add_dev)

class_intf->add_dev(dev, class_intf);

mutex_unlock(&dev->class->p->class_mutex);

}

done:

put_device(dev);

return error;

DPMError:

bus_remove_device(dev);

BusError:

device_remove_attrs(dev);

AttrsError:

device_remove_class_symlinks(dev);

SymlinkError:

if (MAJOR(dev->devt))

device_remove_sys_dev_entry(dev);

devtattrError:

if (MAJOR(dev->devt))

device_remove_file(dev, &devt_attr);

ueventattrError:

device_remove_file(dev, &uevent_attr);

attrError:

kobject_uevent(&dev->kobj, KOBJ_REMOVE);

kobject_del(&dev->kobj);

Error:

cleanup_device_parent(dev);

if (parent)

put_device(parent);

name_error:

kfree(dev->p);

dev->p = NULL;

goto done;

}

device_add()将dev加入设备驱动模型。它先是调用get_device(dev)增加dev的引用计数，然后调用device_private_init()分配和初始化dev->p，调用dev_set_name()设置dev名字。然后是准备将dev加入sysfs，先是用get_device(parent)增加对parent的引用计数(无论是直接挂在parent下还是通过一个类层挂在parent下都要增加parent的引用计数），然后调用setup_parent()找到实际要加入的父kobject，通过kobject_add()加入其下。然后是添加属性和属性集合的操作，调用device_create_file()添加uevent属性，调用device_add_attrs()添加device/type/class预定义的属性与属性集合。如果dev有被分配设备号，再用device_create_file()添加dev属性，并用device_create_sys_dev_entry()在/sys/dev下添加相应的软链接，最后调用devtmpfs_create_node()在/dev下创建相应的设备文件。然后调用device_add_class_symlinks()添加dev与class间的软链接，调用bus_add_device()添加dev与bus间的软链接，并将dev挂入bus的设备链表。调用dpm_sysfs_add()增加dev下的power属性集合，调用device_pm_add()将dev加入dpm_list链表。

调用kobject_uevent()发布KOBJ_ADD消息，调用bus_probe_device()为dev寻找合适的驱动。如果有parent节点，把dev->p->knode_parent挂入parent->p->klist_children链表。如果dev有所属的class，将dev->knode_class挂在class->p->class_devices上，并调用可能的类设备接口的add_dev()方法。可能对于直接在bus上的设备来说，自然可以调用bus_probe_device()查找驱动，而不与总线直接接触的设备，则要靠class来发现驱动，这里的class_interface中的add_dev()方法，就是一个绝好的机会。最后会调用put_device(dev)释放在函数开头增加的引用计数。

device_add()要做的事很多，但想想每件事都在情理之中。device是设备驱动模型的基本元素，在class、bus、dev、devices中都有它的身影。device_add()要适应各种类型的设备注册，自然会越来越复杂。可以说文件开头定义的内部函数，差不多都是为了这里服务的。

/*
 * device_unregister - remove @dev from the system and drop a reference.
 *
 * Calls device_del() to take the device out of the hierarchy, then
 * put_device() to release one reference.
 */
void device_unregister(struct device *dev)
{
	pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
	device_del(dev);
	put_device(dev);
}

有注册自然有注销。device_unregister()就是用于将dev从系统中注销，并释放创建时产生的引用计数。

void device_del(struct device *dev)

{

struct device *parent = dev->parent;

struct class_interface *class_intf;

/* Notify clients of device removal. This call must come

* before dpm_sysfs_remove().

if (dev->bus)

blocking_notifier_call_chain(&dev->bus->p->bus_notifier,

BUS_NOTIFY_DEL_DEVICE, dev);

device_pm_remove(dev);

dpm_sysfs_remove(dev);

if (parent)

klist_del(&dev->p->knode_parent);

if (MAJOR(dev->devt)) {

devtmpfs_delete_node(dev);

device_remove_sys_dev_entry(dev);

device_remove_file(dev, &devt_attr);

}

if (dev->class) {

device_remove_class_symlinks(dev);

mutex_lock(&dev->class->p->class_mutex);

/* notify any interfaces that the device is now gone */

list_for_each_entry(class_intf,

&dev->class->p->class_interfaces, node)

if (class_intf->remove_dev)

class_intf->remove_dev(dev, class_intf);

/* remove the device from the class list */

klist_del(&dev->knode_class);

mutex_unlock(&dev->class->p->class_mutex);

}

device_remove_file(dev, &uevent_attr);

device_remove_attrs(dev);

bus_remove_device(dev);

* Some platform devices are driven without driver attached

* and managed resources may have been acquired. Make sure

* all resources are released.

devres_release_all(dev);

/* Notify the platform of the removal, in case they

* need to do anything…

if (platform_notify_remove)

platform_notify_remove(dev);

kobject_uevent(&dev->kobj, KOBJ_REMOVE);

cleanup_device_parent(dev);

kobject_del(&dev->kobj);

put_device(parent);

}

device_del()是与device_add()相对的函数，进行实际的将dev从系统中脱离的工作。这其中既有将dev从设备驱动模型各种链表中脱离的工作，又有将dev从sysfs的各个角落删除的工作。大致流程与device_add()相对，就不一一介绍。

爆发结束，下面来看一些比较轻松的函数。

* device_get_devnode – path of device node file

* @dev: device

* @mode: returned file access mode

* @tmp: possibly allocated string

* Return the relative path of a possible device node.

* Non-default names may need to allocate a memory to compose

* a name. This memory is returned in tmp and needs to be

* freed by the caller.

const char *device_get_devnode(struct device *dev,

mode_t *mode, const char **tmp)

{

char *s;

*tmp = NULL;

/* the device type may provide a specific name */

if (dev->type && dev->type->devnode)

*tmp = dev->type->devnode(dev, mode);

if (*tmp)

return *tmp;

/* the class may provide a specific name */

if (dev->class && dev->class->devnode)

*tmp = dev->class->devnode(dev, mode);

if (*tmp)

return *tmp;

/* return name without allocation, tmp == NULL */

if (strchr(dev_name(dev), ‘!’) == NULL)

return dev_name(dev);

/* replace ‘!’ in the name with ‘/’ */

*tmp = kstrdup(dev_name(dev), GFP_KERNEL);

if (!*tmp)

return NULL;

while ((s = strchr(*tmp, ‘!’)))

s[0] = ‘/’;

return *tmp;

}

device_get_devnode()返回设备的路径名。不过似乎可以由device_type或者class定义一些独特的返回名称。

static struct device *next_device(struct klist_iter *i)

{

struct klist_node *n = klist_next(i);

struct device *dev = NULL;

struct device_private *p;

if (n) {

p = to_device_private_parent(n);

dev = p->device;

}

return dev;

}

int device_for_each_child(struct device *parent, void *data,

int (*fn)(struct device *dev, void *data))

{

struct klist_iter i;

struct device *child;

int error = 0;

if (!parent->p)

return 0;

klist_iter_init(&parent->p->klist_children, &i);

while ((child = next_device(&i)) && !error)

error = fn(child, data);

klist_iter_exit(&i);

return error;

}

struct device *device_find_child(struct device *parent, void *data,

int (*match)(struct device *dev, void *data))

{

struct klist_iter i;

struct device *child;

if (!parent)

return NULL;

klist_iter_init(&parent->p->klist_children, &i);

while ((child = next_device(&i)))

if (match(child, data) && get_device(child))

break;

klist_iter_exit(&i);

return child;

}

device_for_each_child()对dev下的每个子device，都调用一遍特定的处理函数。

device_find_child()则是查找dev下特定的子device，查找使用特定的match函数。

这两个遍历过程都使用了klist特有的遍历函数，支持遍历过程中的节点删除等功能。next_device()则是为了遍历方便封装的一个内部函数。

下面本该是root_device注册相关的代码。但经过检查，linux内核中使用到的root_device很少见，而且在sysfs中也未能找到一个实际的例子。所以root_device即使还未被弃用，也并非主流，我们将其跳过。

与kobject和kset类似，device也为我们提供了快速device创建方法，下面就看看吧。

/*
 * Release callback installed by device_create_vargs(): frees the device
 * structure itself.
 */
static void device_create_release(struct device *dev)
{
	pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
	kfree(dev);
}

/*
 * device_create_vargs - allocate a device and register it with a class.
 * @class:   class the device belongs to; NULL or an ERR_PTR is rejected.
 * @parent:  parent device, may be NULL.
 * @devt:    device number.
 * @drvdata: driver data stored with dev_set_drvdata().
 * @fmt:     printf-style format for the device name.
 * @args:    arguments for @fmt.
 *
 * Returns the new device, or ERR_PTR(-ENODEV) for a bad class,
 * ERR_PTR(-ENOMEM) if allocation fails, or the ERR_PTR of whatever
 * naming or device_add() reported.
 */
struct device *device_create_vargs(struct class *class, struct device *parent,
				   dev_t devt, void *drvdata, const char *fmt,
				   va_list args)
{
	struct device *dev = NULL;
	int retval = -ENODEV;

	if (class == NULL || IS_ERR(class))
		goto error;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev) {
		retval = -ENOMEM;
		goto error;
	}

	/*
	 * Initialise before anything can fail, so that the put_device() on
	 * the error path acts on an initialised kobject and frees the
	 * device through device_create_release().
	 */
	device_initialize(dev);
	dev->devt = devt;
	dev->class = class;
	dev->parent = parent;
	dev->release = device_create_release;
	dev_set_drvdata(dev, drvdata);

	retval = kobject_set_name_vargs(&dev->kobj, fmt, args);
	if (retval)
		goto error;

	/* device_register() is device_initialize() + device_add() */
	retval = device_add(dev);
	if (retval)
		goto error;

	return dev;

error:
	/* dev is NULL if we failed before the allocation; that is fine */
	put_device(dev);
	return ERR_PTR(retval);
}

/*
 * device_create - variadic front end of device_create_vargs().
 * @class:   class the device belongs to.
 * @parent:  parent device, may be NULL.
 * @devt:    device number.
 * @drvdata: driver data attached to the new device.
 * @fmt:     printf-style format for the device name, followed by its
 *           arguments.
 *
 * Returns whatever device_create_vargs() returns.
 */
struct device *device_create(struct class *class, struct device *parent,
			     dev_t devt, void *drvdata, const char *fmt, ...)
{
	va_list vargs;
	struct device *dev;

	va_start(vargs, fmt);
	dev = device_create_vargs(class, parent, devt, drvdata, fmt, vargs);
	va_end(vargs);
	return dev;
}

这里的device_create()提供了一个快速的dev创建注册方法。只是中间没有提供设置device_type的方法，或许是这样的device已经够特立独行了，不需要搞出一类来。

static int __match_devt(struct device *dev, void *data)

{

dev_t *devt = data;

return dev->devt == *devt;

}

/*
 * Find the device in @class whose device number is @devt and unregister
 * it. Does nothing if there is no such device.
 */
void device_destroy(struct class *class, dev_t devt)
{
	struct device *victim;

	victim = class_find_device(class, NULL, &devt, __match_devt);
	if (!victim)
		return;

	/* give back the reference taken by class_find_device() */
	put_device(victim);
	device_unregister(victim);
}

device_destroy()就是与device_create()相对的注销函数。至于这里为什么会多一个put_device(dev)，也很简单，因为在class_find_device()找到dev时，调用了get_device()。

/*
 * class_find_device - find a device of @class accepted by @match.
 * @class: class to search; NULL yields NULL.
 * @start: device to start the iteration from, or NULL.
 * @data:  opaque argument passed to @match.
 * @match: returns non-zero for the device being looked for.
 *
 * Returns the first matching device with a reference taken through
 * get_device(), or NULL if nothing matches or the class has not been
 * initialised yet. The caller must drop the reference with put_device().
 */
struct device *class_find_device(struct class *class, struct device *start,
				 void *data,
				 int (*match)(struct device *, void *))
{
	struct class_dev_iter iter;
	struct device *dev;

	if (!class)
		return NULL;
	if (!class->p) {
		WARN(1, "%s called for class '%s' before it was initialized",
		     __func__, class->name);
		return NULL;
	}

	class_dev_iter_init(&iter, class, start, NULL);
	while ((dev = class_dev_iter_next(&iter))) {
		if (match(dev, data)) {
			get_device(dev);
			break;
		}
	}
	class_dev_iter_exit(&iter);

	return dev;
}

class_find_device()本来是class.c中的内容，其实现也与之前讲的遍历dev->p->klist_children类似，无非是在klist提供的遍历方法上加以封装。但我们这里列出class_find_device()的实现与使用它的device_destroy()，却是为了更好地分析这个调用流程中dev是如何被保护的。它实际上是经历了三个保护手段：首先在class_dev_iter_next()->klist_next()中，是受到struct klist中 spinlock_t k_lock保护的。在找到下一点并解锁之前，就增加了struct klist_node中的struct kref n_ref引用计数。在当前的next()调用完，到下一个next()调用之前，都是受这个增加的引用计数保护的。再看class_find_device()中，使用get_device(dev)增加了dev本身的引用计数保护(当然也要追溯到kobj->kref中)，这是第三种保护。直到device_destroy()中主动调用put_device(dev)才去除了这种保护。

本来对dev的保护，应该完全是由dev中的引用计数完成的。但实际上这种保护很多时候是间接完成的。例如这里的klist中的自旋锁，klist_node中的引用计数，都不过是为了保持class的设备链表中对dev的引用计数不消失，这是一种间接保护的手段，保证了这中间即使外界主动释放class设备链表对dev的引用计数，dev仍然不会被实际注销。这种曲折的联系，才真正发挥了引用计数的作用，构成设备驱动模型独特的魅力。

/*
 * device_rename - rename a registered device.
 * @dev:      device to rename.
 * @new_name: the new name.
 *
 * Renames the kobject and, for class devices, creates the link with the
 * new name in the class directory before removing the old one.
 *
 * Returns 0 on success, -EINVAL if @dev is NULL, -ENOMEM if the old name
 * cannot be duplicated, or the error from kobject_rename() or
 * sysfs_create_link_nowarn().
 */
int device_rename(struct device *dev, char *new_name)
{
	char *old_device_name = NULL;
	int error;

	dev = get_device(dev);
	if (!dev)
		return -EINVAL;

	pr_debug("device: '%s': %s: renaming to '%s'\n", dev_name(dev),
		 __func__, new_name);

	/* keep a copy: the old name is needed to remove the old link */
	old_device_name = kstrdup(dev_name(dev), GFP_KERNEL);
	if (!old_device_name) {
		error = -ENOMEM;
		goto out;
	}

	error = kobject_rename(&dev->kobj, new_name);
	if (error)
		goto out;

	if (dev->class) {
		error = sysfs_create_link_nowarn(&dev->class->p->class_subsys.kobj,
						 &dev->kobj, dev_name(dev));
		if (error)
			goto out;
		sysfs_remove_link(&dev->class->p->class_subsys.kobj,
				  old_device_name);
	}

out:
	put_device(dev);

	kfree(old_device_name);

	return error;
}

device_rename()是供设备注册后改变名称用的，除了改变/sys/devices下的名称，还改变了/sys/class下的软链接名称。前者很自然，但后者却很难想到。即使简单的地方，经过重重调试，我们也会惊讶于linux的心细如发。

/*
 * device_move_class_links - repoint the "device" link after a move.
 * @dev:        device that was moved.
 * @old_parent: previous parent, or NULL.
 * @new_parent: new parent, or NULL.
 *
 * Removes the "device" link if there was an old parent and creates one
 * pointing at the new parent if there is one. Returns 0 or the error from
 * sysfs_create_link().
 */
static int device_move_class_links(struct device *dev,
				   struct device *old_parent,
				   struct device *new_parent)
{
	int error = 0;

	if (old_parent)
		sysfs_remove_link(&dev->kobj, "device");
	if (new_parent)
		error = sysfs_create_link(&dev->kobj, &new_parent->kobj,
					  "device");
	return error;
}

device_move_class_links()只是一个内部函数，后面还有操纵它的那只手。这里的device_move_class_links显得很名不副实，并没有操作class中软链接的举动。这很正常，因为在sysfs中软链接是针对kobject来说的，所以即使位置变掉了，软链接还是能很准确地定位。

* device_move – moves a device to a new parent

* @dev: the pointer to the struct device to be moved

* @new_parent: the new parent of the device (can by NULL)

* @dpm_order: how to reorder the dpm_list

int device_move(struct device *dev, struct device *new_parent,

enum dpm_order dpm_order)

{

int error;

struct device *old_parent;

struct kobject *new_parent_kobj;

dev = get_device(dev);

if (!dev)

return -EINVAL;

device_pm_lock();

new_parent = get_device(new_parent);

new_parent_kobj = get_device_parent(dev, new_parent);

pr_debug(“device: ‘%s’: %s: moving to ‘%s’\n”, dev_name(dev),

__func__, new_parent ? dev_name(new_parent) : “<NULL>”);

error = kobject_move(&dev->kobj, new_parent_kobj);

if (error) {

cleanup_glue_dir(dev, new_parent_kobj);

put_device(new_parent);

goto out;

}

old_parent = dev->parent;

dev->parent = new_parent;

if (old_parent)

klist_remove(&dev->p->knode_parent);

if (new_parent) {

klist_add_tail(&dev->p->knode_parent,

&new_parent->p->klist_children);

set_dev_node(dev, dev_to_node(new_parent));

}

if (!dev->class)

goto out_put;

error = device_move_class_links(dev, old_parent, new_parent);

if (error) {

/* We ignore errors on cleanup since we’re hosed anyway… */

device_move_class_links(dev, new_parent, old_parent);

if (!kobject_move(&dev->kobj, &old_parent->kobj)) {

if (new_parent)

klist_remove(&dev->p->knode_parent);

dev->parent = old_parent;

if (old_parent) {

klist_add_tail(&dev->p->knode_parent,

&old_parent->p->klist_children);

set_dev_node(dev, dev_to_node(old_parent));

}

}

cleanup_glue_dir(dev, new_parent_kobj);

put_device(new_parent);

goto out;

}

switch (dpm_order) {

case DPM_ORDER_NONE:

break;

case DPM_ORDER_DEV_AFTER_PARENT:

device_pm_move_after(dev, new_parent);

break;

case DPM_ORDER_PARENT_BEFORE_DEV:

device_pm_move_before(new_parent, dev);

break;

case DPM_ORDER_DEV_LAST:

device_pm_move_last(dev);

break;

}

out_put:

put_device(old_parent);

out:

device_pm_unlock();

put_device(dev);

return error;

}

device_move()就是将dev移到一个新的parent下。但也有可能这个parent是空的。大部分操作围绕在引用计数上，get_device()，put_device()。而且换了新的parent，到底要加到sysfs中哪个目录下，还要再调用get_device_parent()研究一下。主要的操作就是kobject_move()和device_move_class_links()。因为在sysfs中软链接是针对kobject来说的，所以即使位置变掉了，软链接还是能很准确地定位，所以在/sys/dev、/sys/bus、/sys/class中的软链接都不用变，这实在是sysfs的一大优势。除此之外，device_move()还涉及到电源管理的问题，device移动影响到dev在dpm_list上的位置，我们对此不了解，先忽略之。

/*
 * device_shutdown - call the shutdown hook of every registered device.
 *
 * Walks devices_kset in reverse order. For each device the bus's
 * shutdown() is preferred; the driver's shutdown() is used only when the
 * bus has none. Afterwards the references on the sysfs dev kobjects are
 * dropped and async_synchronize_full() is called.
 */
void device_shutdown(void)
{
	struct device *dev, *devn;

	list_for_each_entry_safe_reverse(dev, devn, &devices_kset->list,
					 kobj.entry) {
		if (dev->bus && dev->bus->shutdown) {
			dev_dbg(dev, "shutdown\n");
			dev->bus->shutdown(dev);
		} else if (dev->driver && dev->driver->shutdown) {
			dev_dbg(dev, "shutdown\n");
			dev->driver->shutdown(dev);
		}
	}
	kobject_put(sysfs_dev_char_kobj);
	kobject_put(sysfs_dev_block_kobj);
	kobject_put(dev_kobj);
	async_synchronize_full();
}

这个device_shutdown()是在系统关闭时才调用的。它动用了很少使用的devices_kset，从而可以遍历到每个注册到sysfs上的设备，调用相应的总线或驱动定义的shutdown()函数。提起这个，还是在device_initialize()中将dev->kobj->kset统一设为devices_kset的。原来设备虽然有不同的parent，但kset还是一样的。这样我们就能理解/sys/devices下的顶层设备目录是怎么来的，因为没有parent，就在调用kobject_add()时将kset->kobj当成了parent，所以会直接挂在顶层目录下。这样的目录大致有pci0000:00、virtual等等。

看完了core.c，我有种明白机器人也是由零件组成的的感觉。linux设备驱动模型的大门已经打开了四分之一。随着分析的深入，我们大概也会越来越明白linux的良苦用心。

上节我们分析设备驱动模型中的device，主要是drivers/base/core.c，可以说是代码量最大的一个文件。本节要分析的驱动driver，就要相对简单很多。原因也很简单，对于driver，我们能定义的公共部分实在不多，能在sysfs中表达的也很少。本节的分析将围绕drivers/base/driver.c，但头文件仍然是include/linux/device.h和drivers/base/base.h。

先让我们来看看driver的结构。

/*
 * struct device_driver - the generic driver structure of the driver model.
 * The callbacks below all take the device they operate on.
 */
struct device_driver {

/* static initial name of the driver */
const char *name;

/* bus this driver belongs to */
struct bus_type *bus;

/* module that contains the driver */
struct module *owner;

const char *mod_name; /* used for built-in modules */

bool suppress_bind_attrs; /* disables bind/unbind via sysfs */

/* called when the driver is bound to a device */
int (*probe) (struct device *dev);

/* called when the device is removed */
int (*remove) (struct device *dev);

/* called when the device is shut down */
void (*shutdown) (struct device *dev);

/* called when the device is put to sleep */
int (*suspend) (struct device *dev, pm_message_t state);

/* called when the device is woken up again */
int (*resume) (struct device *dev);

/* attribute groups of the driver */
const struct attribute_group **groups;

/* power-management operations */
const struct dev_pm_ops *pm;

/* driver-core private data */
struct driver_private *p;

};

struct device_driver就是模型定义的通用驱动结构。name是驱动名称，但这个name也只是在静态定义的初始名称，实际使用的名称还是由kobject中保管的。bus指向驱动所在的总线，owner是驱动所在的模块，还有一个所在模块名称mod_name，suppress_bind_attrs定义是否允许驱动通过sysfs决定挂载还是卸载设备。下面是一系列函数指针，probe是在驱动刚与设备挂接时调用的，remove是在设备卸载时调用的，shutdown是在设备关闭时调用的(说实话我现在还不知道remove和shutdown的区别），suspend是设备休眠时调用的，resume是设备恢复时调用的。groups是属性集合，pm是电源管理的函数集合，p是指向driver_private的指针。

/*
 * struct driver_private - driver-core private part of a device_driver,
 * holding its links to the other driver-model components.
 */
struct driver_private {

/* kobject of the driver */
struct kobject kobj;

/* list of devices attached to this driver */
struct klist klist_devices;

/* node in the bus's list of drivers */
struct klist_node knode_bus;

/* link to the module the driver belongs to */
struct module_kobject *mkobj;

/* back pointer to the owning device_driver */
struct device_driver *driver;

};

/* Map an embedded kobject back to its containing driver_private. */
#define to_driver(obj) container_of(obj, struct driver_private, kobj)

与device类似，device_driver把与其它组件联系的大部分结构变量移到struct driver_private中来。首先是kobj，在sysfs中代表driver目录本身。klist_devices是驱动下的设备链表，knode_bus是要挂载在总线的驱动链表上的节点。mkobj是driver与相关module的联系，之前在device_driver结构中已经有指向module的指针，但这还不够，在/sys下你能发现一个module目录，所以驱动所属的模块在sysfs中也有显示，具体留到代码中再看。driver指针自然是从driver_private指回struct device_driver的。

/*
 * struct driver_attribute - a struct attribute wrapped with read/write
 * callbacks that take a device_driver.
 */
struct driver_attribute {

struct attribute attr;

/* read callback: receives the driver and the output buffer */
ssize_t (*show)(struct device_driver *driver, char *buf);

/* write callback: receives the driver, the input buffer and its length */
ssize_t (*store)(struct device_driver *driver, const char *buf,

size_t count);

};

/*
 * DRIVER_ATTR - define a struct driver_attribute named driver_attr_<_name>,
 * initialised through __ATTR() with the given mode and show/store callbacks.
 */
#define DRIVER_ATTR(_name, _mode, _show, _store) \

struct driver_attribute driver_attr_##_name = \

__ATTR(_name, _mode, _show, _store)

除了以上两个结构，还有struct driver_attribute。driver_attribute是driver对struct attribute的封装，添加了两个特用于device_driver的读写函数。这种封装看似简单重复，工作量很小，但在使用时却会造成巨大的便利。

好，结构介绍完毕，下面看driver.c中的实现。

static struct device *next_device(struct klist_iter *i)

{

struct klist_node *n = klist_next(i);

struct device *dev = NULL;

struct device_private *dev_prv;

if (n) {

dev_prv = to_device_private_driver(n);

dev = dev_prv->device;

}

return dev;

}

int driver_for_each_device(struct device_driver *drv, struct device *start,

void *data, int (*fn)(struct device *, void *))

{

struct klist_iter i;

struct device *dev;

int error = 0;

if (!drv)

return -EINVAL;

klist_iter_init_node(&drv->p->klist_devices, &i,

start ? &start->p->knode_driver : NULL);

while ((dev = next_device(&i)) && !error)

error = fn(dev, data);

klist_iter_exit(&i);

return error;

}

struct device *driver_find_device(struct device_driver *drv,

struct device *start, void *data,

int (*match)(struct device *dev, void *data))

{

struct klist_iter i;

struct device *dev;

if (!drv)

return NULL;

klist_iter_init_node(&drv->p->klist_devices, &i,

(start ? &start->p->knode_driver : NULL));

while ((dev = next_device(&i)))

if (match(dev, data) && get_device(dev))

break;

klist_iter_exit(&i);

return dev;

}

driver_for_each_device()是对drv的设备链表中的每个设备调用一次指定函数。

driver_find_device()是在drv的设备链表中寻找一个设备，寻找使用指定的匹配函数。

这两个函数都不陌生，在之前分析device的core.c中已经见到与它们很类似的函数，只不过那里是遍历设备的子设备链表，这里是遍历驱动的设备链表。next_device()同样是辅助用的内部函数。

int driver_create_file(struct device_driver *drv,

struct driver_attribute *attr)

{

int error;

if (drv)

error = sysfs_create_file(&drv->p->kobj, &attr->attr);

else

error = -EINVAL;

return error;

}

void driver_remove_file(struct device_driver *drv,

struct driver_attribute *attr)

{

if (drv)

sysfs_remove_file(&drv->p->kobj, &attr->attr);

}

driver_create_file()创建drv下的属性文件，调用sysfs_create_file()实现。

driver_remove_file()删除drv下的属性文件，调用sysfs_remove_file()实现。

static int driver_add_groups(struct device_driver *drv,

const struct attribute_group **groups)

{

int error = 0;

int i;

if (groups) {

for (i = 0; groups[i]; i++) {

error = sysfs_create_group(&drv->p->kobj, groups[i]);

if (error) {

while (--i >= 0)

sysfs_remove_group(&drv->p->kobj,

groups[i]);

break;

}

}

}

return error;

}

static void driver_remove_groups(struct device_driver *drv,

const struct attribute_group **groups)

{

int i;

if (groups)

for (i = 0; groups[i]; i++)

sysfs_remove_group(&drv->p->kobj, groups[i]);

}

driver_add_groups()在drv目录下添加属性集合，调用sysfs_create_group()实现。

driver_remove_groups()在drv目录下删除属性集合，调用sysfs_remove_group()实现。

发现两点问题：第一，是不是觉得driver_add_groups()不太合适，最好改为driver_create_groups()才搭调。但不只是driver用driver_add_groups()，device也使用device_add_groups()，不止一处这样做。第二，有没有发现driver_create_file()是外部函数，driver_add_groups()就是内部函数，也就是说driver只对外提供添加属性的接口，却不提供添加属性集合的接口。理由嘛？在struct device_driver中已经专门定义了一个groups变量来添加属性集合，后面就不宜再重复提供接口，而且创建属性集合需要的操作远比创建属性费时。在device中也是这样做的。

另外，driver中只提供管理属性文件的方法，却不提供管理二进制属性文件的方法，这是因为驱动本身没有这种需求，只有部分设备才要求二进制文件表示。

struct device_driver *get_driver(struct device_driver *drv)

{

if (drv) {

struct driver_private *priv;

struct kobject *kobj;

kobj = kobject_get(&drv->p->kobj);

priv = to_driver(kobj);

return priv->driver;

}

return NULL;

}

void put_driver(struct device_driver *drv)

{

kobject_put(&drv->p->kobj);

}

get_driver()增加drv的引用计数，put_driver()减少drv的引用计数。这都是通过drv->p->kobj来做的。

struct device_driver *driver_find(const char *name, struct bus_type *bus)

{

struct kobject *k = kset_find_obj(bus->p->drivers_kset, name);

struct driver_private *priv;

if (k) {

priv = to_driver(k);

return priv->driver;

}

return NULL;

}

driver_find()从bus的驱动链表中寻找特定名称的driver。

/**

* driver_register - register driver with bus

* @drv: driver to register

* We pass off most of the work to the bus_add_driver() call,

* since most of the things we have to do deal with the bus

* structures.

*/

int driver_register(struct device_driver *drv)

{

int ret;

struct device_driver *other;

BUG_ON(!drv->bus->p);

if ((drv->bus->probe && drv->probe) ||

(drv->bus->remove && drv->remove) ||

(drv->bus->shutdown && drv->shutdown))

printk(KERN_WARNING "Driver '%s' needs updating - please use "

"bus_type methods\n", drv->name);

other = driver_find(drv->name, drv->bus);

if (other) {

put_driver(other);

printk(KERN_ERR "Error: Driver '%s' is already registered, "

"aborting...\n", drv->name);

return -EBUSY;

}

ret = bus_add_driver(drv);

if (ret)

return ret;

ret = driver_add_groups(drv, drv->groups);

if (ret)

bus_remove_driver(drv);

return ret;

}

driver_register()将drv注册到系统中。它真是做得难以预料地简单，所有的工作几乎完全是由bus_add_driver()代为完成的。但你要注意，在调用driver_register()前，drv->bus一定要预先设置。device可以不绑定bus，但driver一定要绑定到bus上。

void driver_unregister(struct device_driver *drv)

{

if (!drv || !drv->p) {

WARN(1, "Unexpected driver unregister!\n");

return;

}

driver_remove_groups(drv, drv->groups);

bus_remove_driver(drv);

}

driver_unregister()将drv从系统中撤销。大部分工作是调用bus_remove_driver()完成的。可以看出bus_add_driver()与bus_remove_driver()相对。driver和bus的联系如此紧密，以至于driver的注册和撤销工作都可以由bus代劳了。我们需要更进一步的分析。

经过调查，我们发现有很大一部分driver的代码被移动到了bus.c中。我们本节是以driver为主，所以接下来会尽量在不惊动bus的情况下，分析存在于bus.c中的driver代码。

static ssize_t drv_attr_show(struct kobject *kobj, struct attribute *attr,

char *buf)

{

struct driver_attribute *drv_attr = to_drv_attr(attr);

struct driver_private *drv_priv = to_driver(kobj);

ssize_t ret = -EIO;

if (drv_attr->show)

ret = drv_attr->show(drv_priv->driver, buf);

return ret;

}

static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr,

const char *buf, size_t count)

{

struct driver_attribute *drv_attr = to_drv_attr(attr);

struct driver_private *drv_priv = to_driver(kobj);

ssize_t ret = -EIO;

if (drv_attr->store)

ret = drv_attr->store(drv_priv->driver, buf, count);

return ret;

}

static struct sysfs_ops driver_sysfs_ops = {

.show = drv_attr_show,

.store = drv_attr_store,

};

看到这里，你终于觉得driver开始正常了，它还要定义sysfs读写时操作的函数。

static void driver_release(struct kobject *kobj)

{

struct driver_private *drv_priv = to_driver(kobj);

pr_debug("driver: '%s': %s\n", kobject_name(kobj), __func__);

kfree(drv_priv);

}

static struct kobj_type driver_ktype = {

.sysfs_ops = &driver_sysfs_ops,

.release = driver_release,

};

与device的释放函数device_release不同，driver_release没有提供外界代码运行的机会，只是简单地释放drv_priv结构。

/* Manually detach a device from its associated driver. */

static ssize_t driver_unbind(struct device_driver *drv,

const char *buf, size_t count)

{

struct bus_type *bus = bus_get(drv->bus);

struct device *dev;

int err = -ENODEV;

dev = bus_find_device_by_name(bus, NULL, buf);

if (dev && dev->driver == drv) {

if (dev->parent) /* Needed for USB */

down(&dev->parent->sem);

device_release_driver(dev);

if (dev->parent)

up(&dev->parent->sem);

err = count;

}

put_device(dev);

bus_put(bus);

return err;

}

static DRIVER_ATTR(unbind, S_IWUSR, NULL, driver_unbind);

/*

* Manually attach a device to a driver.

* Note: the driver must want to bind to the device,

* it is not possible to override the driver's id table.

*/

static ssize_t driver_bind(struct device_driver *drv,

const char *buf, size_t count)

{

struct bus_type *bus = bus_get(drv->bus);

struct device *dev;

int err = -ENODEV;

dev = bus_find_device_by_name(bus, NULL, buf);

if (dev && dev->driver == NULL && driver_match_device(drv, dev)) {

if (dev->parent) /* Needed for USB */

down(&dev->parent->sem);

down(&dev->sem);

err = driver_probe_device(drv, dev);

up(&dev->sem);

if (dev->parent)

up(&dev->parent->sem);

if (err > 0) {

/* success */

err = count;

} else if (err == 0) {

/* driver didn’t accept device */

err = -ENODEV;

}

}

put_device(dev);

bus_put(bus);

return err;

}

static DRIVER_ATTR(bind, S_IWUSR, NULL, driver_bind);

上面描述了driver下两个只写的属性文件，unbind和bind。它们应该是为用户空间提供的接口，用来手动解除或建立设备与驱动之间的绑定。

static int driver_add_attrs(struct bus_type *bus, struct device_driver *drv)

{

int error = 0;

int i;

if (bus->drv_attrs) {

for (i = 0; attr_name(bus->drv_attrs[i]); i++) {

error = driver_create_file(drv, &bus->drv_attrs[i]);

if (error)

goto err;

}

}

done:

return error;

err:

while (--i >= 0)

driver_remove_file(drv, &bus->drv_attrs[i]);

goto done;

}

static void driver_remove_attrs(struct bus_type *bus,

struct device_driver *drv)

{

int i;

if (bus->drv_attrs) {

for (i = 0; attr_name(bus->drv_attrs[i]); i++)

driver_remove_file(drv, &bus->drv_attrs[i]);

}

}

driver_add_attrs()向drv目录下添加属性，只是这些属性都是在bus中定义的drv_attrs[]。

driver_remove_attrs()从drv目录中删除相应的bus->drv_attrs[]。

static int __must_check add_bind_files(struct device_driver *drv)

{

int ret;

ret = driver_create_file(drv, &driver_attr_unbind);

if (ret == 0) {

ret = driver_create_file(drv, &driver_attr_bind);

if (ret)

driver_remove_file(drv, &driver_attr_unbind);

}

return ret;

}

static void remove_bind_files(struct device_driver *drv)

{

driver_remove_file(drv, &driver_attr_bind);

driver_remove_file(drv, &driver_attr_unbind);

}

add_bind_files()在drv目录下增加bind和unbind属性。

remove_bind_files()从drv目录下删除bind和unbind属性。

static ssize_t driver_uevent_store(struct device_driver *drv,

const char *buf, size_t count)

{

enum kobject_action action;

if (kobject_action_type(buf, count, &action) == 0)

kobject_uevent(&drv->p->kobj, action);

return count;

}

static DRIVER_ATTR(uevent, S_IWUSR, NULL, driver_uevent_store);

这是drv目录下的uevent属性文件，提供了从drv发送uevent的方法。

/**

* bus_add_driver - Add a driver to the bus.

* @drv: driver.

*/

int bus_add_driver(struct device_driver *drv)

{

struct bus_type *bus;

struct driver_private *priv;

int error = 0;

bus = bus_get(drv->bus);

if (!bus)

return -EINVAL;

pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name);

priv = kzalloc(sizeof(*priv), GFP_KERNEL);

if (!priv) {

error = -ENOMEM;

goto out_put_bus;

}

klist_init(&priv->klist_devices, NULL, NULL);

priv->driver = drv;

drv->p = priv;

priv->kobj.kset = bus->p->drivers_kset;

error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL,

"%s", drv->name);

if (error)

goto out_unregister;

if (drv->bus->p->drivers_autoprobe) {

error = driver_attach(drv);

if (error)

goto out_unregister;

}

klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers);

module_add_driver(drv->owner, drv);

error = driver_create_file(drv, &driver_attr_uevent);

if (error) {

printk(KERN_ERR "%s: uevent attr (%s) failed\n",

__func__, drv->name);

}

error = driver_add_attrs(bus, drv);

if (error) {

/* How the hell do we get out of this pickle? Give up */

printk(KERN_ERR "%s: driver_add_attrs(%s) failed\n",

__func__, drv->name);

}

if (!drv->suppress_bind_attrs) {

error = add_bind_files(drv);

if (error) {

/* Ditto */

printk(KERN_ERR "%s: add_bind_files(%s) failed\n",

__func__, drv->name);

}

}

kobject_uevent(&priv->kobj, KOBJ_ADD);

return 0;

out_unregister:

kfree(drv->p);

drv->p = NULL;

kobject_put(&priv->kobj);

out_put_bus:

bus_put(bus);

return error;

}

bus_add_driver()看似是把drv与bus联系起来，其实是完成driver加入系统的大部分操作。

首先调用bus_get(drv->bus)增加对bus的引用。

分配并初始化drv->p，即driver_private结构。

调用kobject_init_and_add()将drv加入sysfs，之前只是设置了priv->kobj.kset为bus->p->drivers_kset，所以drv目录会出现在bus目录的drivers子目录中。如果总线允许自动probe，就会调用driver_attach()将驱动和总线上的设备进行匹配，这个过程先略过。

然后调用klist_add_tail()将drv挂入总线的驱动链表。

调用module_add_driver()创建driver相关的模块在sysfs中的表示。后面专门描述。

调用driver_create_file()在drv目录下创建uevent属性文件。

调用driver_add_attrs()在drv目录下添加bus->drv_attrs[]中定义的属性。

如果drv->suppress_bind_attrs为零，即允许用户空间决定驱动何时链接和卸载设备，则调用add_bind_files()添加bind和unbind属性文件。

调用kobject_uevent()向用户空间发布KOBJ_ADD消息。

从bus_add_driver()的处理过程来看，driver只在bus的drivers目录下出现，没什么软链接，需要的属性也不多。

/**

* bus_remove_driver - delete driver from bus's knowledge.

* @drv: driver.

* Detach the driver from the devices it controls, and remove

* it from its bus's list of drivers. Finally, we drop the reference

* to the bus we took in bus_add_driver().

*/

void bus_remove_driver(struct device_driver *drv)

{

if (!drv->bus)

return;

if (!drv->suppress_bind_attrs)

remove_bind_files(drv);

driver_remove_attrs(drv->bus, drv);

driver_remove_file(drv, &driver_attr_uevent);

klist_remove(&drv->p->knode_bus);

pr_debug("bus: '%s': remove driver %s\n", drv->bus->name, drv->name);

driver_detach(drv);

module_remove_driver(drv);

kobject_put(&drv->p->kobj);

bus_put(drv->bus);

}

bus_remove_driver()将drv从系统中撤销，与bus_add_driver()相对应。

driver真正精彩的地方在于probe函数，对设备的操作，对用户空间提供的接口，可惜这些都是特定的。这里只能将driver与bus联系起来，并在以后与device联系起来。

不过不必失望，下面我们分析下drivers/base/module.c，它显示了与驱动有关的module，在sysfs中的表现情况。

首先介绍使用到的结构。应该说module.c的代码实现很简单，但使用到的结构不简单。

struct module_attribute {

struct attribute attr;

ssize_t (*show)(struct module_attribute *, struct module *, char *);

ssize_t (*store)(struct module_attribute *, struct module *,

const char *, size_t count);

void (*setup)(struct module *, const char *);

int (*test)(struct module *);

void (*free)(struct module *);

};

struct param_attribute

{

struct module_attribute mattr;

struct kernel_param *param;

};

struct module_param_attrs

{

unsigned int num;

struct attribute_group grp;

struct param_attribute attrs[0];

};

struct module_kobject

{

struct kobject kobj;

struct module *mod;

struct kobject *drivers_dir;

struct module_param_attrs *mp;

};

可以看到module_attribute结构除了包含struct attribute，还多增加了好几条函数指针。而这只是最简单的，struct param_attribute除了包含module_attribute，还有一个指向kernel_param的指针param。这个kernel_param就太复杂了，是外界向module提供参数用的窗口，这里忽略。后面还有struct module_param_attrs和struct module_kobject。

static char *make_driver_name(struct device_driver *drv)

{

char *driver_name;

driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2,

GFP_KERNEL);

if (!driver_name)

return NULL;

sprintf(driver_name, "%s:%s", drv->bus->name, drv->name);

return driver_name;

}

make_driver_name()将drv的名字和drv->bus的名字合起来，不过这是一个内部函数，具体使用还要看后面。

static void module_create_drivers_dir(struct module_kobject *mk)

{

if (!mk || mk->drivers_dir)

return;

mk->drivers_dir = kobject_create_and_add("drivers", &mk->kobj);

}

module_create_drivers_dir()在mk所在的目录下创建一个drivers的目录。不过因为是使用kobject_create_and_add()，所以这个kobject使用默认的dynamic_kobj_ktype。

void module_add_driver(struct module *mod, struct device_driver *drv)

{

char *driver_name;

int no_warn;

struct module_kobject *mk = NULL;

if (!drv)

return;

if (mod)

mk = &mod->mkobj;

else if (drv->mod_name) {

struct kobject *mkobj;

/* Lookup built-in module entry in /sys/modules */

mkobj = kset_find_obj(module_kset, drv->mod_name);

if (mkobj) {

mk = container_of(mkobj, struct module_kobject, kobj);

/* remember our module structure */

drv->p->mkobj = mk;

/* kset_find_obj took a reference */

kobject_put(mkobj);

}

}

if (!mk)

return;

/* Don’t check return codes; these calls are idempotent */

no_warn = sysfs_create_link(&drv->p->kobj, &mk->kobj, "module");

driver_name = make_driver_name(drv);

if (driver_name) {

module_create_drivers_dir(mk);

no_warn = sysfs_create_link(mk->drivers_dir, &drv->p->kobj,

driver_name);

kfree(driver_name);

}

}

module_add_driver()在module下添加与driver的联系。

开始调用kset_find_obj()从module_kset下寻找drv所属的module对应的kobj。说明每个module在加载时都会在/sys/module中创建一个kobject目录。这里找到后只是将其赋给drv->p->mkobj，并调用kobject_put()释放找到时加上的引用计数。至于为什么driver不保留对module的引用计数，或许是不需要，或许是已经存在了。

接下来调用sysfs_create_link()在驱动目录中添加指向module目录的软链接，名称就是module。

调用module_create_drivers_dir()在module目录下建立drivers子目录。

调用sysfs_create_link()在drivers子目录下建立指向驱动目录的软链接，名称使用make_driver_name()的返回结果。

void module_remove_driver(struct device_driver *drv)

{

struct module_kobject *mk = NULL;

char *driver_name;

if (!drv)

return;

sysfs_remove_link(&drv->p->kobj, "module");

if (drv->owner)

mk = &drv->owner->mkobj;

else if (drv->p->mkobj)

mk = drv->p->mkobj;

if (mk && mk->drivers_dir) {

driver_name = make_driver_name(drv);

if (driver_name) {

sysfs_remove_link(mk->drivers_dir, driver_name);

kfree(driver_name);

}

}

}

module_remove_driver()消除driver与相应module之间的软链接关系。

对于module，应该是另一个议题了，这里只是简单涉及，下节我们将涉及到总线bus，并深入分析device和driver的关系。

前面我们分析了设备驱动模型中的device和driver，device和driver本来是不相关的东西，只因为bus的存在，才被联系到了一起。本节就来看看设备驱动模型中起枢纽作用的bus。本节的头文件在include/linux/device.h和drivers/base/base.h，实现代码主要在bus.c中。因为在bus中有很多代码是为了device找到driver或者driver找到device而定义的，本节先尽量忽略这部分，专注于bus的注册和注销，属性定义等内容。剩下的留到讨论device和driver关系时再分析。

先来看看bus的数据结构。

struct bus_type {

const char *name;

struct bus_attribute *bus_attrs;

struct device_attribute *dev_attrs;

struct driver_attribute *drv_attrs;

int (*match)(struct device *dev, struct device_driver *drv);

int (*uevent)(struct device *dev, struct kobj_uevent_env *env);

int (*probe)(struct device *dev);

int (*remove)(struct device *dev);

void (*shutdown)(struct device *dev);

int (*suspend)(struct device *dev, pm_message_t state);

int (*resume)(struct device *dev);

const struct dev_pm_ops *pm;

struct bus_type_private *p;

};

struct bus_type是bus的通用数据结构。

name是bus的名称，注意到这里也是const char类型的，在sysfs中使用的还是kobj中动态创建的名称，这里的name只是初始名。

bus_attrs是bus为自己定义的一系列属性，dev_attrs是bus为旗下的device定义的一系列属性，drv_attrs是bus为旗下的driver定义的一系列属性。其中dev_attrs在bus_add_device()->device_add_attrs()中被加入dev目录下，drv_attrs在bus_add_driver()->driver_add_attrs()中被加入driver目录下。

match函数匹配总线中的dev和driver，返回值为1代表匹配成功，为0则失败。

uevent函数用于总线对uevent的环境变量添加，但在总线下设备的dev_uevent处理函数也有对它的调用。

probe函数是总线在匹配成功时调用的函数，bus->probe和drv->probe中只会有一个起效，同时存在时使用bus->probe。

remove函数在总线上设备或者驱动要删除时调用，bus->remove和drv->remove中同样只会有一个起效。

shutdown函数在所有设备都关闭时调用，即在core.c中的device_shutdown()函数中调用，bus->shutdown和drv->shutdown同样只会有一个起效。

suspend函数是在总线上设备休眠时调用。

resume函数是在总线上设备恢复时调用。

pm是struct dev_pm_ops类型，其中定义了一系列电源管理的函数。

p是指向bus_type_private的指针，其中定义了将bus同其它组件联系起来的变量。

struct bus_type_private {

struct kset subsys;

struct kset *drivers_kset;

struct kset *devices_kset;

struct klist klist_devices;

struct klist klist_drivers;

struct blocking_notifier_head bus_notifier;

unsigned int drivers_autoprobe:1;

struct bus_type *bus;

};

#define to_bus(obj) container_of(obj, struct bus_type_private, subsys.kobj)

struct bus_type_private是将bus同device、driver、sysfs联系起来的结构。

subsys是kset类型，代表bus在sysfs中的类型。

drivers_kset代表bus目录下的drivers子目录。
devices_kset代表bus目录下的devices子目录。

klist_devices是bus的设备链表，klist_drivers是bus的驱动链表。

bus_notifier用于在总线上内容发生变化时调用特定的函数，这里略过。

drivers_autoprobe标志定义是否允许device和driver自动匹配，如果允许会在device或者driver注册时就进行匹配工作。

bus指针指向struct bus_type类型。

使用struct bus_type_private可以将struct bus_type中的部分细节屏蔽掉，利于外界使用bus_type。struct driver_private和struct device_private都有类似的功能。

struct bus_attribute {

struct attribute attr;

ssize_t (*show)(struct bus_type *bus, char *buf);

ssize_t (*store)(struct bus_type *bus, const char *buf, size_t count);

};

#define BUS_ATTR(_name, _mode, _show, _store) \

struct bus_attribute bus_attr_##_name = __ATTR(_name, _mode, _show, _store)

#define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr)

struct bus_attribute是bus对struct attribute类型的封装，更方便总线属性的定义。

static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr,

char *buf)

{

struct bus_attribute *bus_attr = to_bus_attr(attr);

struct bus_type_private *bus_priv = to_bus(kobj);

ssize_t ret = 0;

if (bus_attr->show)

ret = bus_attr->show(bus_priv->bus, buf);

return ret;

}

static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr,

const char *buf, size_t count)

{

struct bus_attribute *bus_attr = to_bus_attr(attr);

struct bus_type_private *bus_priv = to_bus(kobj);

ssize_t ret = 0;

if (bus_attr->store)

ret = bus_attr->store(bus_priv->bus, buf, count);

return ret;

}

static struct sysfs_ops bus_sysfs_ops = {

.show = bus_attr_show,

.store = bus_attr_store,

};

static struct kobj_type bus_ktype = {

.sysfs_ops = &bus_sysfs_ops,

};

以上应该是我们最熟悉的部分，bus_ktype中定义了bus对应的kset应该使用的kobj_type实例。与此类似，driver使用的是自定义的driver_ktype，device使用的是自定义的device_ktype。只是这里仅仅定义了sysfs_ops，并未定义release函数，不知bus_type_private打算何时释放。

int bus_create_file(struct bus_type *bus, struct bus_attribute *attr)

{

int error;

if (bus_get(bus)) {

error = sysfs_create_file(&bus->p->subsys.kobj, &attr->attr);

bus_put(bus);

} else

error = -EINVAL;

return error;

}

void bus_remove_file(struct bus_type *bus, struct bus_attribute *attr)

{

if (bus_get(bus)) {

sysfs_remove_file(&bus->p->subsys.kobj, &attr->attr);

bus_put(bus);

}

}

bus_create_file()在bus目录下创建属性文件，bus_remove_file()在bus目录下删除属性文件。类似的函数在driver和device中都有见到。

static int bus_uevent_filter(struct kset *kset, struct kobject *kobj)

{

struct kobj_type *ktype = get_ktype(kobj);

if (ktype == &bus_ktype)

return 1;

return 0;

}

static struct kset_uevent_ops bus_uevent_ops = {

.filter = bus_uevent_filter,

};

static struct kset *bus_kset;

可以看到这里定义了一个bus_uevent_ops变量，这是kset对uevent事件处理所用的结构，它会用在bus_kset中。

int __init buses_init(void)

{

bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL);

if (!bus_kset)

return -ENOMEM;

return 0;

}

在buses_init()中创建了/sys/bus目录，这是一个kset类型，使用了bus_uevent_ops的uevent操作类型。

其实这里的操作不难想象，在devices中我们有一个类似的devices_kset，可以回顾一下。

static struct kset_uevent_ops device_uevent_ops = {

.filter = dev_uevent_filter,

.name = dev_uevent_name,

.uevent = dev_uevent,

};

/* kset to create /sys/devices/ */

struct kset *devices_kset;

int __init devices_init(void)

{

devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);

}

void device_initialize(struct device *dev)

{

dev->kobj.kset = devices_kset;

}

devices_kset在devices_init()中被创建，使用相应的device_uevent_ops进行uevent处理。而devices_kset又被设为每个device初始化时使用的kset。这就不难想象每个device都是以devices_kset为所属kset的，并使用device_uevent_ops中的处理函数。

只是这里还不知bus_kset会在哪里用到，或许是每个bus所属的kset吧，下面会有答案。

static ssize_t show_drivers_autoprobe(struct bus_type *bus, char *buf)

{

return sprintf(buf, "%d\n", bus->p->drivers_autoprobe);

}

static ssize_t store_drivers_autoprobe(struct bus_type *bus,

const char *buf, size_t count)

{

if (buf[0] == '0')

bus->p->drivers_autoprobe = 0;

else

bus->p->drivers_autoprobe = 1;

return count;

}

static ssize_t store_drivers_probe(struct bus_type *bus,

const char *buf, size_t count)

{

struct device *dev;

dev = bus_find_device_by_name(bus, NULL, buf);

if (!dev)

return -ENODEV;

if (bus_rescan_devices_helper(dev, NULL) != 0)

return -EINVAL;

return count;

}

static BUS_ATTR(drivers_probe, S_IWUSR, NULL, store_drivers_probe);

static BUS_ATTR(drivers_autoprobe, S_IWUSR | S_IRUGO,

show_drivers_autoprobe, store_drivers_autoprobe);

这里定义了总线下的两个属性，只写的drivers_probe，和可读写的drivers_autoprobe。至于其怎么实现的，我们现在还不关心。

static int add_probe_files(struct bus_type *bus)

{

int retval;

retval = bus_create_file(bus, &bus_attr_drivers_probe);

if (retval)

goto out;

retval = bus_create_file(bus, &bus_attr_drivers_autoprobe);

if (retval)

bus_remove_file(bus, &bus_attr_drivers_probe);

out:

return retval;

}

static void remove_probe_files(struct bus_type *bus)

{

bus_remove_file(bus, &bus_attr_drivers_autoprobe);

bus_remove_file(bus, &bus_attr_drivers_probe);

}

add_probe_files()在bus目录下添加drivers_probe和drivers_autoprobe文件。

remove_probe_files()在bus目录下删除drivers_probe和drivers_autoprobe文件。

这两个函数对bus的probe类型属性进行管理，就像add_bind_files/remove_bind_files对driver的bind类型属性进行管理一样。

static ssize_t bus_uevent_store(struct bus_type *bus,

const char *buf, size_t count)

{

enum kobject_action action;

if (kobject_action_type(buf, count, &action) == 0)

kobject_uevent(&bus->p->subsys.kobj, action);

return count;

}

static BUS_ATTR(uevent, S_IWUSR, NULL, bus_uevent_store);

上面定义了bus的一个属性uevent，用于bus所在的kset节点主动发起uevent消息。

同样的uevent文件在driver目录中也有见到。device目录中也有，不过除了store_uevent之外，还增加了show_uevent的功能。

static struct device *next_device(struct klist_iter *i)

{

struct klist_node *n = klist_next(i);

struct device *dev = NULL;

struct device_private *dev_prv;

if (n) {

dev_prv = to_device_private_bus(n);

dev = dev_prv->device;

}

return dev;

}

int bus_for_each_dev(struct bus_type *bus, struct device *start,

void *data, int (*fn)(struct device *, void *))

{

struct klist_iter i;

struct device *dev;

int error = 0;

if (!bus)

return -EINVAL;

klist_iter_init_node(&bus->p->klist_devices, &i,

(start ? &start->p->knode_bus : NULL));

while ((dev = next_device(&i)) && !error)

error = fn(dev, data);

klist_iter_exit(&i);

return error;

}

struct device *bus_find_device(struct bus_type *bus,

struct device *start, void *data,

int (*match)(struct device *dev, void *data))

{

struct klist_iter i;

struct device *dev;

if (!bus)

return NULL;

klist_iter_init_node(&bus->p->klist_devices, &i,

(start ? &start->p->knode_bus : NULL));

while ((dev = next_device(&i)))

if (match(dev, data) && get_device(dev))

break;

klist_iter_exit(&i);

return dev;

}

bus_for_each_dev()是以bus的设备链表中每个设备为参数，调用指定的处理函数。

bus_find_device()是寻找bus设备链表中的某个设备，使用指定的匹配函数。

这两个函数提供遍历bus的设备链表的方法，类似于driver_for_each_device/driver_find_device对driver的设备链表的遍历，device_for_each_child/device_find_child对device的子设备链表的遍历。

static int match_name(struct device *dev, void *data)

{

const char *name = data;

return sysfs_streq(name, dev_name(dev));

}

struct device *bus_find_device_by_name(struct bus_type *bus,

struct device *start, const char *name)

{

return bus_find_device(bus, start, (void *)name, match_name);

}

bus_find_device_by_name()给出了如何使用遍历函数的例子，寻找bus设备链表中指定名称的设备。

static struct device_driver *next_driver(struct klist_iter *i)

{

struct klist_node *n = klist_next(i);

struct driver_private *drv_priv;

if (n) {

drv_priv = container_of(n, struct driver_private, knode_bus);

return drv_priv->driver;

}

return NULL;

}

int bus_for_each_drv(struct bus_type *bus, struct device_driver *start,

void *data, int (*fn)(struct device_driver *, void *))

{

struct klist_iter i;

struct device_driver *drv;

int error = 0;

if (!bus)

return -EINVAL;

klist_iter_init_node(&bus->p->klist_drivers, &i,

start ? &start->p->knode_bus : NULL);

while ((drv = next_driver(&i)) && !error)

error = fn(drv, data);

klist_iter_exit(&i);

return error;

}

bus_for_each_drv()对bus的驱动链表中的每个驱动调用指定的函数。

这和前面的bus_for_each_dev/bus_find_device都是类似的，只是你可能怀疑为什么会没有bus_find_drv。是没有它的用武之地吗？

请看driver.c中的driver_find()函数。

struct device_driver *driver_find(const char *name, struct bus_type *bus)

{

struct kobject *k = kset_find_obj(bus->p->drivers_kset, name);

struct driver_private *priv;

if (k) {

priv = to_driver(k);

return priv->driver;

}

return NULL;

}

driver_find()函数是在bus的驱动链表中寻找指定名称的驱动，它的存在证明bus_find_drv()完全是用得上的。可linux却偏偏没有实现bus_find_drv。driver_find()的实现也因此一直走内层路线，它直接用kset_find_obj()进行kobject的名称匹配，调用to_driver()等内容将kobj转化为drv。首先这完全不同于bus_for_each_drv()等一系列遍历函数，它们走的都是在klist中寻找的路线，这里却是走的sysfs中kset内部链表。其次，这里其实也是获得了drv的一个引用计数，在kset_find_obj()中会增加匹配的kobj的引用计数，driver_find()并没有释放，就相当于获取了drv的一个引用计数。这样虽然也可以，但代码写得很不优雅。可见人无完人，linux代码还有许多可改进之处。当然，也可能在最新的linux版本中已经改正了。

static int bus_add_attrs(struct bus_type *bus)

{

int error = 0;

int i;

if (bus->bus_attrs) {

for (i = 0; attr_name(bus->bus_attrs[i]); i++) {

error = bus_create_file(bus, &bus->bus_attrs[i]);

if (error)

goto err;

}

}

done:

return error;

err:

while (--i >= 0)

bus_remove_file(bus, &bus->bus_attrs[i]);

goto done;

}

static void bus_remove_attrs(struct bus_type *bus)

{

int i;

if (bus->bus_attrs) {

for (i = 0; attr_name(bus->bus_attrs[i]); i++)

bus_remove_file(bus, &bus->bus_attrs[i]);

}

}

bus_add_attrs()将bus->bus_attrs中定义的属性加入bus目录。

bus_remove_attrs()将bus->bus_attrs中定义的属性删除。

开始看struct bus_type时我们说到结构中的bus_attrs、dev_attrs、drv_attrs三种属性，后两者分别在device_add_attrs()和driver_add_attrs()中添加，最后的bus_attrs也终于在bus_add_attrs()中得到添加。只是它们虽然都定义在bus_type中，却是添加在完全不同的三个地方。

static void klist_devices_get(struct klist_node *n)

{

struct device_private *dev_prv = to_device_private_bus(n);

struct device *dev = dev_prv->device;

get_device(dev);

}

static void klist_devices_put(struct klist_node *n)

{

struct device_private *dev_prv = to_device_private_bus(n);

struct device *dev = dev_prv->device;

put_device(dev);

}

klist_devices_get()用于bus设备链表上添加节点时增加对相应设备的引用。

klist_devices_put()用于bus设备链表上删除节点时减少对相应设备的引用。

相似的函数是device中的klist_children_get/klist_children_put，这是device的子设备链表。除此之外，bus的驱动链表和driver的设备链表，都没有这种引用计数的保护。原因还未知，也许是linux觉得驱动不太靠谱，万一突然当掉，也不至于影响device的正常管理。

/**

* bus_register - register a bus with the system.

* @bus: bus.

* Once we have that, we registered the bus with the kobject

* infrastructure, then register the children subsystems it has:

* the devices and drivers that belong to the bus.

*/

int bus_register(struct bus_type *bus)

{

int retval;

struct bus_type_private *priv;

priv = kzalloc(sizeof(struct bus_type_private), GFP_KERNEL);

if (!priv)

return -ENOMEM;

priv->bus = bus;

bus->p = priv;

BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier);

retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name);

if (retval)

goto out;

priv->subsys.kobj.kset = bus_kset;

priv->subsys.kobj.ktype = &bus_ktype;

priv->drivers_autoprobe = 1;

retval = kset_register(&priv->subsys);

if (retval)

goto out;

retval = bus_create_file(bus, &bus_attr_uevent);

if (retval)

goto bus_uevent_fail;

priv->devices_kset = kset_create_and_add("devices", NULL,

&priv->subsys.kobj);

if (!priv->devices_kset) {

retval = -ENOMEM;

goto bus_devices_fail;

}

priv->drivers_kset = kset_create_and_add("drivers", NULL,

&priv->subsys.kobj);

if (!priv->drivers_kset) {

retval = -ENOMEM;

goto bus_drivers_fail;

}

klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put);

klist_init(&priv->klist_drivers, NULL, NULL);

retval = add_probe_files(bus);

if (retval)

goto bus_probe_files_fail;

retval = bus_add_attrs(bus);

if (retval)

goto bus_attrs_fail;

pr_debug("bus: '%s': registered\n", bus->name);

return 0;

bus_attrs_fail:

remove_probe_files(bus);

bus_probe_files_fail:

kset_unregister(bus->p->drivers_kset);

bus_drivers_fail:

kset_unregister(bus->p->devices_kset);

bus_devices_fail:

bus_remove_file(bus, &bus_attr_uevent);

bus_uevent_fail:

kset_unregister(&bus->p->subsys);

kfree(bus->p);

out:

bus->p = NULL;

return retval;

}

bus_register()将bus注册到系统中。

先分配并初始化bus->p，名称使用bus->name，所属的kset使用bus_kset（果然不出所料），类型使用bus_ktype。bus_ktype的使用同driver中的driver_ktype，和device中的device_ktype一样，都是自定义的kobj_type，要知道kobj_type的使用关系到release函数，和自定义属性类型能否正常发挥。

调用kset_register()将bus加入sysfs，因为只是设置了kset，所以会被加入/sys/bus目录下。与driver直接加入相关总线的drivers目录类似，却是与device复杂的寻找父节点过程相去甚远。

在bus目录下添加uevent属性。

在bus目录下创建devices子目录。它是一个kset类型的，目的是展示bus下的设备链表。

在bus目录下创建drivers子目录。它也是一个kset类型的，目的是展示bus下的驱动链表。

或许在最开始有设备驱动模型时，还需要kset来表达这种链表关系，但随着klist等结构的加入，kset的作用也越来越少，现在更多的作用是用来处理uevent消息。

之后初始化bus的设备链表和驱动链表，其中设备链表会占用设备的引用计数。

调用add_probe_files()在bus目录下添加probe相关的两个属性文件。

调用bus_add_attrs()添加bus结构中定义的属性。

bus_register()中的操作出乎意料的简单。bus既不需要在哪里添加软链接，也不需要主动向谁报到，从来都是device和driver到bus这里报到的。所以bus_register()中只需要初始化一下结构，添加到sysfs中，添加相关的子目录和属性文件，就行了。

void bus_unregister(struct bus_type *bus)

{

pr_debug("bus: '%s': unregistering\n", bus->name);

bus_remove_attrs(bus);

remove_probe_files(bus);

kset_unregister(bus->p->drivers_kset);

kset_unregister(bus->p->devices_kset);

bus_remove_file(bus, &bus_attr_uevent);

kset_unregister(&bus->p->subsys);

kfree(bus->p);

bus->p = NULL;

}

bus_unregister()与bus_register()相对，将bus从系统中注销。不过要把bus注销也不是那么简单的，bus中的driver和device都对bus保有一份引用计数。或许正是如此，bus把释放bus->p的动作放在了bus_unregister()中，这至少能保证较早地释放不需要的内存空间。而且在bus引用计数用完时，也不会有任何操作，bus的容错性还是很高的。

static struct bus_type *bus_get(struct bus_type *bus)

{

if (bus) {

kset_get(&bus->p->subsys);

return bus;

}

return NULL;

}

static void bus_put(struct bus_type *bus)

{

if (bus)

kset_put(&bus->p->subsys);

}

bus_get()增加对bus的引用计数，bus_put()减少对bus的引用计数。实际上这里bus的引用计数降为零时，只是将sysfs中bus对应的目录删除。

无论是bus，还是device，还是driver，都是将主要的注销工作放在相关的unregister中。至于在引用计数降为零时的操作，大概只在device_release()中可见。这主要是因为引用计数，虽然是广泛用在设备驱动模型中，但实际支持的，绝大部分是设备的热插拔，而不是总线或者驱动的热插拔。当然，桥设备的热插拔也可能附带总线的热插拔。

/*

* Yes, this forcably breaks the klist abstraction temporarily. It

* just wants to sort the klist, not change reference counts and

* take/drop locks rapidly in the process. It does all this while

* holding the lock for the list, so objects can't otherwise be

* added/removed while we're swizzling.

*/

static void device_insertion_sort_klist(struct device *a, struct list_head *list,

int (*compare)(const struct device *a,

const struct device *b))

{

struct list_head *pos;

struct klist_node *n;

struct device_private *dev_prv;

struct device *b;

list_for_each(pos, list) {

n = container_of(pos, struct klist_node, n_node);

dev_prv = to_device_private_bus(n);

b = dev_prv->device;

if (compare(a, b) <= 0) {

list_move_tail(&a->p->knode_bus.n_node,

&b->p->knode_bus.n_node);

return;

}

}

list_move_tail(&a->p->knode_bus.n_node, list);

}

/*
 * Re-order the device klist of @bus using @compare.
 *
 * With the klist's k_lock held, every node is moved one by one into a
 * temporary list via device_insertion_sort_klist(), and the temporary
 * list is then spliced back into the bus's device list.
 */
void bus_sort_breadthfirst(struct bus_type *bus,
			   int (*compare)(const struct device *a,
					  const struct device *b))
{
	struct klist *devs = bus_get_device_klist(bus);
	struct list_head *cur, *next;
	LIST_HEAD(sorted_devices);

	spin_lock(&devs->k_lock);
	/* _safe variant: the current node is unlinked inside the loop body */
	list_for_each_safe(cur, next, &devs->k_list) {
		struct klist_node *knode =
			container_of(cur, struct klist_node, n_node);
		struct device_private *priv = to_device_private_bus(knode);

		device_insertion_sort_klist(priv->device, &sorted_devices,
					    compare);
	}
	list_splice(&sorted_devices, &devs->k_list);
	spin_unlock(&devs->k_lock);
}

bus_sort_breadthfirst()是将bus的设备链表进行排序，使用指定的比较函数，排成升序（device_insertion_sort_klist()总是把新节点a插到第一个满足compare(a, b) <= 0的节点b之前）。

本节主要分析了bus的注册注销过程，下节我们将深入分析device和driver的绑定过程，了解bus在这其中到底起了什么作用。随着我们了解的逐渐深入，未知的东西也在逐渐增多。但饭要一口一口吃，我们的分析也要一点一点来，急不得。

前面我们分析了device、driver、bus三种类型，主要是三者的注册与注销，在sysfs中的目录与属性文件创建等内容。本节就来详细分析下，在设备注册到总线上时，总线是如何为其寻找对应的驱动的；在驱动注册到总线上时，总线又是如何为其寻找对应的设备的。

本节的实现代码集中在drivers/base/bus.c和drivers/base/dd.c中。

先来回忆下，在device_register()->device_add()中，先是调用bus_add_device()添加device与bus间的联系，并添加bus为device定义的属性，然后会调用bus_probe_device()。bus_probe_device()会试图为已挂在总线上的该设备寻找对应的驱动。我们的故事就从这里开始。

* bus_probe_device – probe drivers for a new device

* @dev: device to probe

* – Automatically probe for a driver if the bus allows it.

void bus_probe_device(struct device *dev)

{

struct bus_type *bus = dev->bus;

int ret;

if (bus && bus->p->drivers_autoprobe) {

ret = device_attach(dev);

WARN_ON(ret < 0);

}

}

bus_probe_device()为总线上的设备寻找驱动。它先是检查bus->p->drivers_autoprobe，看是否允许自动探测。允许了才会调用device_attach()进行实际的寻找工作。

说到bus->p->drivers_autoprobe这个变量，它是在bus_type_private中的，在调用bus_register()前都初始化不了，在bus_register()中自动定为1。所以，除非是用户空间通过drivers_autoprobe属性文件主动禁止，bus总是允许自动探测的，所有的bus都是如此。

* device_attach – try to attach device to a driver.

* @dev: device.

* Walk the list of drivers that the bus has and call

* driver_probe_device() for each pair. If a compatible

* pair is found, break out and return.

* Returns 1 if the device was bound to a driver;

* 0 if no matching driver was found;

* -ENODEV if the device is not registered.

* When called for a USB interface, @dev->parent->sem must be held.

int device_attach(struct device *dev)

{

int ret = 0;

down(&dev->sem);

if (dev->driver) {

ret = device_bind_driver(dev);

if (ret == 0)

ret = 1;

else {

dev->driver = NULL;

ret = 0;

}

} else {

pm_runtime_get_noresume(dev);

ret = bus_for_each_drv(dev->bus, NULL, dev, __device_attach);

pm_runtime_put_sync(dev);

}

up(&dev->sem);

return ret;

}

device_attach()在实际绑定之前，会用dev->sem进行加锁。不错，dev->sem几乎就是为了在设备与驱动绑定或者解除绑定时加锁用的。还没有看到它在其它地方被调用。

如果在调用device_attach()前就已经设置了dev->driver，就调用device_bind_driver()进行绑定，不然还要调用bus_for_each_drv()进行依次匹配。至于pm_runtime_get_noresume之类的函数，属于电源管理部分，我们现在先忽略。

static void driver_bound(struct device *dev)

{

if (klist_node_attached(&dev->p->knode_driver)) {

printk(KERN_WARNING “%s: device %s already bound\n”,

__func__, kobject_name(&dev->kobj));

return;

}

pr_debug(“driver: ‘%s’: %s: bound to device ‘%s’\n”, dev_name(dev),

__func__, dev->driver->name);

if (dev->bus)

blocking_notifier_call_chain(&dev->bus->p->bus_notifier,

BUS_NOTIFY_BOUND_DRIVER, dev);

klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices);

}

/*
 * Create the two sysfs symlinks tying @dev and @dev->driver together:
 * one under the driver's kobject named after the device, and one under
 * the device's kobject named "driver".
 *
 * Returns 0 on success or the error from sysfs_create_link(); if the
 * second link fails, the first one is removed again.
 */
static int driver_sysfs_add(struct device *dev)
{
	int ret;

	ret = sysfs_create_link(&dev->driver->p->kobj, &dev->kobj,
				kobject_name(&dev->kobj));
	if (ret == 0) {
		ret = sysfs_create_link(&dev->kobj, &dev->driver->p->kobj,
					"driver");
		if (ret)
			/* roll back the first link so nothing is left half-done */
			sysfs_remove_link(&dev->driver->p->kobj,
					  kobject_name(&dev->kobj));
	}
	return ret;
}

/*
 * Remove the sysfs symlinks created by driver_sysfs_add().
 * Does nothing when @dev has no driver set.
 */
static void driver_sysfs_remove(struct device *dev)
{
	struct device_driver *drv = dev->driver;

	if (drv) {
		sysfs_remove_link(&drv->p->kobj, kobject_name(&dev->kobj));
		sysfs_remove_link(&dev->kobj, "driver");
	}
}

/**
 * device_bind_driver - bind a driver to one device.
 * @dev: device.
 *
 * Allow manual attachment of a driver to a device.
 * Caller must have already set @dev->driver.
 *
 * Note that this does not modify the bus reference count
 * nor take the bus's rwsem. Please verify those are accounted
 * for before calling this. (It is ok to call with no other effort
 * from a driver's probe() method.)
 *
 * This function must be called with @dev->sem held.
 *
 * Returns 0 on success or the error from driver_sysfs_add().
 */
int device_bind_driver(struct device *dev)
{
	int ret;

	ret = driver_sysfs_add(dev);
	if (!ret)
		driver_bound(dev);
	return ret;
}

device_bind_driver()将device与driver绑定。它调用了两个内部函数。

其中driver_sysfs_add()负责创建sysfs中driver和device指向对方的软链接。还有一个与它相对的函数driver_sysfs_remove()。

driver_bound()则实际将device加入驱动的设备链表。

因为在调用device_bind_driver()之前就已经设置过dev->driver了，所以这样就将device和driver绑定了。

只是这样好像还缺少了什么，不错，之前看到driver时曾定义了drv->probe函数，bus->probe也有类似的功能，这里只是绑定，却没有调用probe函数。

让我们回过头来，继续看如果device_attach()中没有定义dev->driver会怎么样，是用bus_for_each_drv()对bus的驱动链表进行遍历，遍历函数使用__device_attach。

/*
 * bus_for_each_drv() callback used by device_attach(); @data is the
 * struct device being attached.
 *
 * Returns 0 when @drv does not match the device, otherwise the result
 * of driver_probe_device().
 */
static int __device_attach(struct device_driver *drv, void *data)
{
	struct device *dev = data;

	if (driver_match_device(drv, dev))
		return driver_probe_device(drv, dev);

	return 0;
}

不要小看了__device_attach()，就是在__device_attach()中既完成了匹配工作，又完成了绑定工作。bus_for_each_drv()在遍历中，如果遍历函数返回值不为0，则遍历结束。所以在__device_attach()找到并绑定了适合的驱动，就会返回1停止遍历，否则继续遍历剩余的驱动。

先来看匹配工作，这是在driver_match_device()中完成的。

/*
 * Ask the driver's bus whether @drv can handle @dev.
 * A bus without a match() callback is treated as matching everything (1).
 */
static inline int driver_match_device(struct device_driver *drv,
				      struct device *dev)
{
	if (!drv->bus->match)
		return 1;

	return drv->bus->match(dev, drv);
}

原来driver_match_device()实际是调用drv->bus->match()来完成设备和驱动的匹配的。其实这也是理所当然。因为总线不同，总线规范设备、厂商、类设备等定义的规格都不同，也只有bus亲自主持匹配工作。再具体的就只能等分析具体总线的时候了。

int driver_probe_device(struct device_driver *drv, struct device *dev)

{

int ret = 0;

if (!device_is_registered(dev))

return -ENODEV;

pr_debug(“bus: ‘%s’: %s: matched device %s with driver %s\n”,

drv->bus->name, __func__, dev_name(dev), drv->name);

pm_runtime_get_noresume(dev);

pm_runtime_barrier(dev);

ret = really_probe(dev, drv);

pm_runtime_put_sync(dev);

return ret;

}

如果driver_match_device()匹配成功了，__device_attach()就会继续调用driver_probe_device()完成绑定。但driver_probe_device()又是调用really_probe()完成的。

static atomic_t probe_count = ATOMIC_INIT(0);

static DECLARE_WAIT_QUEUE_HEAD(probe_waitqueue);

static int really_probe(struct device *dev, struct device_driver *drv)

{

int ret = 0;

atomic_inc(&probe_count);

pr_debug(“bus: ‘%s’: %s: probing driver %s with device %s\n”,

drv->bus->name, __func__, drv->name, dev_name(dev));

WARN_ON(!list_empty(&dev->devres_head));

dev->driver = drv;

if (driver_sysfs_add(dev)) {

printk(KERN_ERR “%s: driver_sysfs_add(%s) failed\n”,

__func__, dev_name(dev));

goto probe_failed;

}

if (dev->bus->probe) {

ret = dev->bus->probe(dev);

if (ret)

goto probe_failed;

} else if (drv->probe) {

ret = drv->probe(dev);

if (ret)

goto probe_failed;

}

driver_bound(dev);

ret = 1;

pr_debug(“bus: ‘%s’: %s: bound device %s to driver %s\n”,

drv->bus->name, __func__, dev_name(dev), drv->name);

goto done;

probe_failed:

devres_release_all(dev);

driver_sysfs_remove(dev);

dev->driver = NULL;

if (ret != -ENODEV && ret != -ENXIO) {

/* driver matched but the probe failed */

printk(KERN_WARNING

“%s: probe of %s failed with error %d\n”,

drv->name, dev_name(dev), ret);

}

* Ignore errors returned by ->probe so that the next driver can try

* its luck.

ret = 0;

done:

atomic_dec(&probe_count);

wake_up(&probe_waitqueue);

return ret;

}

really_probe()完成的绑定工作和device_bind_driver()差不多，只是它还会调用bus->probe或者drv->probe中定义的probe函数。

至于在really_probe()中使用probe_count保护，最后调用wake_up(&probe_waitqueue)，都是为了进行同步。

* driver_probe_done

* Determine if the probe sequence is finished or not.

* Should somehow figure out how to use a semaphore, not an atomic variable…

int driver_probe_done(void)

{

pr_debug(“%s: probe_count = %d\n”, __func__,

atomic_read(&probe_count));

if (atomic_read(&probe_count))

return -EBUSY;

return 0;

}

* wait_for_device_probe

* Wait for device probing to be completed.

void wait_for_device_probe(void)

{

/* wait for the known devices to complete their probing */

wait_event(probe_waitqueue, atomic_read(&probe_count) == 0);

async_synchronize_full();

}

driver_probe_done()检查当前是否有设备正在绑定驱动。

wait_for_device_probe()会阻塞到所有的设备绑定完驱动。

关于bus_probe_device()的过程就分析到这里，下面来看下bus_add_driver()又是怎样做的。

之前我们已经知道driver_register()把绝大部分操作都移到了bus_add_driver()中来。其中只有一点和设备与驱动的绑定相关，就是对driver_attach()的调用。

/*
 * Run __driver_attach() for @drv against the devices on its bus.
 * Returns whatever bus_for_each_dev() returns.
 */
int driver_attach(struct device_driver *drv)
{
	struct bus_type *bus = drv->bus;

	return bus_for_each_dev(bus, NULL, drv, __driver_attach);
}

driver_attach()一如device_attach，只是这里是对总线的设备链表进行遍历，使用的遍历函数是__driver_attach()。

/*
 * bus_for_each_dev() callback used by driver_attach(); @data is the
 * struct device_driver being attached. Always returns 0 so that the
 * iteration continues over the remaining devices.
 */
static int __driver_attach(struct device *dev, void *data)
{
	struct device_driver *drv = data;

	/*
	 * Lock device and try to bind to it. We drop the error
	 * here and always return 0, because we need to keep trying
	 * to bind to devices and some drivers will return an error
	 * simply if it didn't support the device.
	 *
	 * driver_probe_device() will spit a warning if there
	 * is an error.
	 */

	if (!driver_match_device(drv, dev))
		return 0;

	if (dev->parent)	/* Needed for USB */
		down(&dev->parent->sem);
	down(&dev->sem);
	/* leave devices that already have a driver alone */
	if (!dev->driver)
		driver_probe_device(drv, dev);
	up(&dev->sem);
	if (dev->parent)
		up(&dev->parent->sem);

	return 0;
}

在__driver_attach()中，driver_match_device()就不说了，它是调到bus->match去的。

然后依然是加锁，调用driver_probe_device()函数。这就与__device_attach()的路径一致了。

不要以为就这样结束了，现在我们只是看到了把device和driver绑定到一起的方法，却没有看到解除绑定的方法。

既然绑定的方法是在设备和驱动注册的时候调用的，那解除绑定自然是在设备或驱动注销的时候。

还是先来看设备的，device_unregister()->device_del()会调用bus_remove_device()将设备从总线上删除。

bus_remove_device()是与bus_add_device()相对的，但也不仅如此，它还调用了device_release_driver()来解除与driver的绑定。

* device_release_driver – manually detach device from driver.

* @dev: device.

* Manually detach device from driver.

* When called for a USB interface, @dev->parent->sem must be held.

void device_release_driver(struct device *dev)

{

* If anyone calls device_release_driver() recursively from

* within their ->remove callback for the same device, they

* will deadlock right here.

down(&dev->sem);

__device_release_driver(dev);

up(&dev->sem);

}

* __device_release_driver() must be called with @dev->sem held.

* When called for a USB interface, @dev->parent->sem must be held as well.

static void __device_release_driver(struct device *dev)

{

struct device_driver *drv;

drv = dev->driver;

if (drv) {

pm_runtime_get_noresume(dev);

pm_runtime_barrier(dev);

driver_sysfs_remove(dev);

if (dev->bus)

blocking_notifier_call_chain(&dev->bus->p->bus_notifier,

BUS_NOTIFY_UNBIND_DRIVER,

dev);

if (dev->bus && dev->bus->remove)

dev->bus->remove(dev);

else if (drv->remove)

drv->remove(dev);

devres_release_all(dev);

dev->driver = NULL;

klist_remove(&dev->p->knode_driver);

if (dev->bus)

blocking_notifier_call_chain(&dev->bus->p->bus_notifier,

BUS_NOTIFY_UNBOUND_DRIVER,

dev);

pm_runtime_put_sync(dev);

}

}

device_release_driver()还是负责加锁，实际的工作由__device_release_driver()来完成。

除了sysfs和结构中解除绑定的操作，还调用了bus->remove或者driver->remove。

虽然device注销时与driver解除绑定很简单，但driver注销要与device解除绑定就要复杂一些，因为它要与设备链表上所有的设备解除绑定。

在driver_unregister()->bus_remove_driver()中，调用了driver_detach()函数。

* driver_detach – detach driver from all devices it controls.

* @drv: driver.

void driver_detach(struct device_driver *drv)

{

struct device_private *dev_prv;

struct device *dev;

for (;;) {

spin_lock(&drv->p->klist_devices.k_lock);

if (list_empty(&drv->p->klist_devices.k_list)) {

spin_unlock(&drv->p->klist_devices.k_lock);

break;

}

dev_prv = list_entry(drv->p->klist_devices.k_list.prev,

struct device_private,

knode_driver.n_node);

dev = dev_prv->device;

get_device(dev);

spin_unlock(&drv->p->klist_devices.k_lock);

if (dev->parent) /* Needed for USB */

down(&dev->parent->sem);

down(&dev->sem);

if (dev->driver == drv)

__device_release_driver(dev);

up(&dev->sem);

if (dev->parent)

up(&dev->parent->sem);

put_device(dev);

}

}

可以看到，driver_detach()基本操作就是与设备链表上的设备解除绑定。等了这么久，终于有个有点意思的地方。一看这个drv的设备链表遍历，首先明明是klist，却没使用标准的循环函数，奇怪，然后发现竟然没有将设备卸下链表的地方，更奇怪。其实再一想就明白了。你看到list_entry()中，是从设备链表末尾取设备解除绑定的，这是驱动生怕前面的设备解除绑定了，后面的就不工作了。也正是因为klist遍历是逆向的，所以无法使用标准函数。至于将设备卸下链表的地方，是在__device_release_driver()中。

或许会奇怪这里为什么会有get_device()和put_device()的操作。这是为了防止设备一取下链表，就会释放最后一个引用计数，导致直接注销。那时候的情况，一定是在占用了dev->sem的同时去等待dev->sem，通俗来说就是死锁。

通过driver_attach()和driver_detach()的训练，我们已经习惯在为设备加锁时，顺便为其父设备加锁。虽然在device_attach()和device_release_driver()中只是对设备本身加锁。或许是害怕在驱动与设备解除绑定的过程中，父设备突然也要解除绑定，导致不一致状态。至于为什么设备方主动要求时不需要对父设备加锁，或许是设备的主动申请更靠谱，不会在子设备绑定或释放的同时，父设备也申请释放。总之，在linux看来，设备恐怕比驱动还要靠谱一些，从driver和bus的引用计数，从这里的加锁情况，都可以看出一二。

/*
 * Return the driver data stored in @dev's private structure, or NULL
 * when @dev or its private structure is missing.
 */
void *dev_get_drvdata(const struct device *dev)
{
	if (!dev || !dev->p)
		return NULL;

	return dev->p->driver_data;
}

/*
 * Store @data as the driver data of @dev.
 *
 * If @dev has no private structure yet, device_private_init() is called
 * first; when that returns non-zero the function returns without storing
 * anything. A NULL @dev is ignored.
 */
void dev_set_drvdata(struct device *dev, void *data)
{
	if (!dev)
		return;

	if (!dev->p && device_private_init(dev))
		return;

	dev->p->driver_data = data;
}

最后的dev_set_drvdata()是在dev->p->driver_data中存放驱动定义的数据。dev_get_drvdata()是获取这个数据。

不要小看这个device_private结构中小小的driver_data，在驱动编写中总能派上大用场。当然也不是说没有driver_data就过不下去，毕竟驱动可以定义一个自己的device结构，并把通用的struct device内嵌其中，然后想放多少数据都行。可那样太麻烦，许多驱动都要专门设置这样一个变量，索性加到通用的数据结构中。而且是直接加到device_private中，眼不见为净，方便省事。

* device_reprobe – remove driver for a device and probe for a new driver

* @dev: the device to reprobe

* This function detaches the attached driver (if any) for the given

* device and restarts the driver probing process. It is intended

* to use if probing criteria changed during a devices lifetime and

* driver attachment should change accordingly.

int device_reprobe(struct device *dev)

{

if (dev->driver) {

if (dev->parent) /* Needed for USB */

down(&dev->parent->sem);

device_release_driver(dev);

if (dev->parent)

up(&dev->parent->sem);

}

return bus_rescan_devices_helper(dev, NULL);

}

device_reprobe()显然是dev对之前的驱动不满意，要新绑定一个。

/*
 * Try to attach a driver to @dev if it does not have one yet.
 * @data is unused.
 *
 * Returns a negative error from device_attach(), or 0 in every other
 * case (already bound, newly bound, or no driver found).
 */
static int __must_check bus_rescan_devices_helper(struct device *dev,
						  void *data)
{
	int rc;

	if (dev->driver)
		return 0;

	if (dev->parent)	/* Needed for USB */
		down(&dev->parent->sem);
	rc = device_attach(dev);
	if (dev->parent)
		up(&dev->parent->sem);

	if (rc < 0)
		return rc;
	return 0;
}

bus_rescan_devices_helper()就是用来绑定新驱动的内部函数。

我们终于成功完成了对dd.c的分析，并将bus.c剩余的部分结了尾。想必大家已经充分领略了device、driver和bus的铁三角结构，下节我们将进入设备驱动模型的另一方天地。

Linux内核部件分析

相关推荐

分类

听说打赏我的人，都进福布斯排行榜啦！

支付宝扫一扫打赏

微信扫一扫打赏

QQ咨询

回顶部