乡下人产国偷v产偷v自拍,国产午夜片在线观看,婷婷成人亚洲综合国产麻豆,久久综合给合久久狠狠狠9

  • <output id="e9wm2"></output>
    <s id="e9wm2"><nobr id="e9wm2"><ins id="e9wm2"></ins></nobr></s>

    • 分享

      Linux存儲IO棧

       waston 2019-03-08
      本系列文章將自底向上分析Linux存儲IO棧源碼(基于4.4.19),為學習Linux存儲做記錄。具體目錄如下:
      一、 Linux內核對象與對象集
      二、 sysfs
      三、 設備模型
      四、 SCSI子系統

      五、 SCSI磁盤驅動sd

      六、 SCSI Target--TCM

      七、 用戶空間IO--UIO

      八、 在用戶空間實現虛擬SCSI磁盤--TCMU

      九、 通用塊層

      十、 文件系統--VFS


      Linux內核對象和對象集

      內核對象作為Linux設備驅動模型的基礎,主要是抽象和封裝總線、設備、驅動、類和接口之間的關系及其具體實現的相關代碼,并在sysfs中呈現。主要抽象成kobject和kset結構:

      /* Kernel object: the basic unit of the device driver model, presented in sysfs. */
      struct kobject {
          const char      *name;   // name shown in sysfs
          struct list_head    entry;   // link in the owning kset's list of kobjects
          struct kobject      *parent; // parent kobject; expresses the tree structure
          struct kset     *kset;   // kset this kobject is linked into
          struct kobj_type    *ktype;  // common methods and attributes of this kobject
          struct kernfs_node  *sd;     // sysfs directory entry 
          struct kref     kref;    // reference count 
      #ifdef CONFIG_DEBUG_KOBJECT_RELEASE
          struct delayed_work release; 
      #endif
          unsigned int state_initialized:1;  // set once the kobject has been initialized
          unsigned int state_in_sysfs:1;     // set while the kobject is present in sysfs
          unsigned int state_add_uevent_sent:1; // set once an ADD event was sent to user space
          unsigned int state_remove_uevent_sent:1; // set once a REMOVE event was sent to user space
          unsigned int uevent_suppress:1; // when set, uevents for this kobject are suppressed
      };

      在kobject結構中,ktype域對kobject的一些通用方法和屬性進行封裝:

      /* Methods and attributes shared by kobjects of one type. */
      struct kobj_type {
          void (*release)(struct kobject *kobj); // called back when the kobject is being released
          const struct sysfs_ops *sysfs_ops; // sysfs operation functions
          struct attribute **default_attrs;  // default attributes (NULL-terminated array)
              // namespace-related operations
          const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj);
          const void *(*namespace)(struct kobject *kobj);
      };

      kset是一組kobject的集合,通過kset可以遍歷這組kobject。例如在SCSI子系統中,設備是一種kobject,通過設備集kset可以遍歷所有的設備。

      /**
       * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem.
       *
       * A kset defines a group of kobjects.  They can be individually
       * different "types" but overall these kobjects all want to be grouped
       * together and operated on in the same manner.  ksets are used to
       * define the attribute callbacks and other common events that happen to
       * a kobject.
       *
       * @list: the list of all kobjects for this kset
       * @list_lock: a lock for iterating over the kobjects
       * @kobj: the embedded kobject for this kset (recursion, isn't it fun...)
       * @uevent_ops: the set of uevent operations for this kset.  These are
       * called whenever a kobject has something happen to it so that the kset
       * can add new environment variables, or filter out the uevents if so
       * desired.
       */
      struct kset {
          struct list_head list; //鏈入kset的kobject鏈表
          spinlock_t list_lock;  //遍歷鏈表是的自旋鎖struct kobject kobj;   //本身可以當(dāng)做kobject對(duì)待
          const struct kset_uevent_ops *uevent_ops; //發(fā)送uevent事件的回調(diào)函數(shù)
      };

      在發送事件到用戶空間時,可以回調kset_uevent_ops中的3個回調函數:

      /* Callbacks a kset supplies for uevent delivery. */
      struct kset_uevent_ops {
          /* A zero return makes kobject_uevent_env() drop the event. */
          int (* const filter)(struct kset *kset, struct kobject *kobj);
          /* Returns the string reported as SUBSYSTEM= in the event. */
          const char *(* const name)(struct kset *kset, struct kobject *kobj);
          /* Lets the kset add its own environment variables to @env. */
          int (* const uevent)(struct kset *kset, struct kobject *kobj,
                    struct kobj_uevent_env *env);
      };
      • filter:在發送事件之前過濾掉某些事件。

      • name:獲取名稱。

      • uevent:設置uevent需要的環境變量。

      內核對象關系

      內核對象相關操作

      /* kobject API overview. */
      /* Initialize a kobject and record its ktype. */
      void kobject_init(struct kobject *kobj, struct kobj_type *ktype);
      /* Name an initialized kobject and add it to the hierarchy. */
      int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...);
      int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...);
      /* Unlink a kobject from the hierarchy. */
      void kobject_del(struct kobject *kobj);
      struct kobject *  kobject_create(void);
      struct kobject * kobject_create_and_add(const char *name, struct kobject *parent);
      int kobject_rename(struct kobject *, const char *new_name);
      int kobject_move(struct kobject *, struct kobject *);
      /* Take / drop a reference on a kobject. */
      struct kobject *kobject_get(struct kobject *kobj);
      void kobject_put(struct kobject *kobj);
      const void *kobject_namespace(struct kobject *kobj);
      char *kobject_get_path(struct kobject *kobj, gfp_t flag);

      內核對象創建及初始化

      初始化流程主要在kobject_init:

      /**
       * kobject_init - initialize a kobject structure
       * @kobj: pointer to the kobject to initialize
       * @ktype: pointer to the ktype for this kobject.
       *
       * This function will properly initialize a kobject such that it can then
       * be passed to the kobject_add() call.
       *
       * After this function is called, the kobject MUST be cleaned up by a call
       * to kobject_put(), not by a call to kfree directly to ensure that all of
       * the memory is cleaned up properly.
       */
      void kobject_init(struct kobject *kobj, struct kobj_type *ktype)
      {
          char *reason = NULL;

          /* Validate both arguments; the first failure decides the message. */
          if (!kobj)
              reason = "invalid kobject pointer!";
          else if (!ktype)
              reason = "must have a ktype to be initialized properly!\n";

          if (reason) {
              printk(KERN_ERR "kobject (%p): %s\n", kobj, reason);
              dump_stack();
              return;
          }

          /* Re-initialization is reported loudly but not treated as fatal. */
          if (kobj->state_initialized) {
              /* do not error out as sometimes we can recover */
              printk(KERN_ERR "kobject (%p): tried to init an initialized "
                     "object, something is seriously wrong.\n", kobj);
              dump_stack();
          }

          /* The actual field setup lives in kobject_init_internal(). */
          kobject_init_internal(kobj);
          kobj->ktype = ktype;
      }
      EXPORT_SYMBOL(kobject_init);

      由上面函數可以看出,初始化由kobject_init_internal完成:

      /*
       * Reset a kobject to its freshly-initialized state: new reference count,
       * empty list link, sysfs/uevent flags cleared, state_initialized set.
       * A NULL @kobj is ignored.
       */
      static void kobject_init_internal(struct kobject *kobj)
      {
          if (kobj) {
              kref_init(&kobj->kref);
              INIT_LIST_HEAD(&kobj->entry);

              kobj->state_in_sysfs = 0;
              kobj->state_add_uevent_sent = 0;
              kobj->state_remove_uevent_sent = 0;
              kobj->state_initialized = 1;
          }
      }

      kobject_create函數僅僅是在調用kobject_init之前,先分配kobject空間。在kobject初始化之後,需要調用kobject_add將kobject添加到sysfs中。

      /**
       * kobject_add - the main kobject add function
       * @kobj: the kobject to add
       * @parent: pointer to the parent of the kobject.
       * @fmt: format to name the kobject with.
       *
       * The kobject name is set and added to the kobject hierarchy in this
       * function.
       *
       * If @parent is set, then the parent of the @kobj will be set to it.
       * If @parent is NULL, then the parent of the @kobj will be set to the
       * kobject associated with the kset assigned to this kobject.  If no kset
       * is assigned to the kobject, then the kobject will be located in the
       * root of the sysfs tree.
       *
       * If this function returns an error, kobject_put() must be called to
       * properly clean up the memory associated with the object.
       * Under no instance should the kobject that is passed to this function
       * be directly freed with a call to kfree(), that can leak memory.
       *
       * Note, no "add" uevent will be created with this call, the caller should set
       * up all of the necessary sysfs files for the object and then call
       * kobject_uevent() with the UEVENT_ADD parameter to ensure that
       * userspace is properly notified of this kobject's creation.
       */
      int kobject_add(struct kobject *kobj, struct kobject *parent,
              const char *fmt, ...)
      {
          va_list ap;
          int rc;

          if (!kobj)
              return -EINVAL;

          /* Normal path: the kobject was initialized, hand off to the varg worker. */
          if (kobj->state_initialized) {
              va_start(ap, fmt);
              rc = kobject_add_varg(kobj, parent, fmt, ap);
              va_end(ap);
              return rc;
          }

          /* Adding an uninitialized kobject is a caller bug. */
          printk(KERN_ERR "kobject '%s' (%p): tried to add an "
                 "uninitialized object, something is seriously wrong.\n",
                 kobject_name(kobj), kobj);
          dump_stack();
          return -EINVAL;
      }
      EXPORT_SYMBOL(kobject_add);

      kobject_add_varg/kobject_add_internal主要完成將kobject添加到sysfs的操作:

      /*
       * Set the kobject's name from @fmt/@vargs, record @parent, then let
       * kobject_add_internal() do the registration. Returns 0 or a negative errno.
       */
      static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,
                             struct kobject *parent,
                             const char *fmt, va_list vargs)
      {
          int rc = kobject_set_name_vargs(kobj, fmt, vargs);

          if (rc != 0) {
              printk(KERN_ERR "kobject: can not set name properly!\n");
              return rc;
          }

          kobj->parent = parent;
          return kobject_add_internal(kobj);
      }
      
      /*
       * Register an already-named kobject: pin its parent, join its kset (if any),
       * create its sysfs directory, and undo all of that if directory creation fails.
       */
      static int kobject_add_internal(struct kobject *kobj)
      {
          struct kobject *parent;
          int rc;

          if (!kobj)
              return -ENOENT;

          if (!(kobj->name && kobj->name[0])) {
              WARN(1, "kobject: (%p): attempted to be registered with empty "
                   "name!\n", kobj);
              return -EINVAL;
          }

          /* Hold a reference on the parent for as long as we are linked under it. */
          parent = kobject_get(kobj->parent);

          /* join kset if set, use it as parent if we do not already have one */
          if (kobj->kset) {
              if (!parent)
                  parent = kobject_get(&kobj->kset->kobj);
              kobj_kset_join(kobj);
              kobj->parent = parent;
          }

          pr_debug("kobject: '%s' (%p): %s: parent: '%s', set: '%s'\n",
               kobject_name(kobj), kobj, __func__,
               parent ? kobject_name(parent) : "<NULL>",
               kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>");

          /* Create the sysfs directory and its attribute files. */
          rc = create_dir(kobj);
          if (!rc) {
              kobj->state_in_sysfs = 1;
              return 0;
          }

          /* Failure: roll back the kset membership and the parent reference. */
          kobj_kset_leave(kobj);
          kobject_put(parent);
          kobj->parent = NULL;

          /* be noisy on error issues */
          if (rc == -EEXIST)
              WARN(1, "%s failed for %s with "
                   "-EEXIST, don't try to register things with "
                   "the same name in the same directory.\n",
                   __func__, kobject_name(kobj));
          else
              WARN(1, "%s failed for %s (error: %d parent: %s)\n",
                   __func__, kobject_name(kobj), rc,
                   parent ? kobject_name(parent) : "'none'");

          return rc;
      }

      由create_dir在sysfs創建真實的目錄和文件,這點由下一篇sysfs詳細描述。理解了kobject_init和kobject_add之後,由名字即可知道kobject_init_and_add和kobject_create_and_add這兩個函數的作用。

      內核對象釋放

      調用kobject_del將kobject從層次結構中摘除,引用計數歸零時再由kobject_put觸發最終釋放:

      /**
       * kobject_del - unlink kobject from hierarchy.
       * @kobj: object.
       */
      void kobject_del(struct kobject *kobj)
      {
          struct kernfs_node *sd;
      
          if (!kobj)
              return;
      
          sd = kobj->sd; // read before sysfs_remove_dir(); released by sysfs_put() below
          sysfs_remove_dir(kobj); // remove the kobject's directory from sysfs
          sysfs_put(sd);
      
          kobj->state_in_sysfs = 0; // mark the kobject as no longer in sysfs
          kobj_kset_leave(kobj);  // unlink the kobject from its kset list
          kobject_put(kobj->parent); // drop a reference on the parent (kobject_release runs only if it was the last one)
          kobj->parent = NULL;
      }
      EXPORT_SYMBOL(kobject_del);
      
      /**
       * kobject_put - decrement refcount for object.
       * @kobj: object.
       *
       * Decrement the refcount, and if 0, call kobject_cleanup().
       */
      void kobject_put(struct kobject *kobj)
      {
          /* A NULL kobject is silently ignored. */
          if (!kobj)
              return;

          /* Dropping a reference on a never-initialized kobject is warned about. */
          if (!kobj->state_initialized)
              WARN(1, KERN_WARNING "kobject: '%s' (%p): is not "
                     "initialized, yet kobject_put() is being "
                     "called.\n", kobject_name(kobj), kobj);

          /* kobject_release() runs when the count reaches zero. */
          kref_put(&kobj->kref, kobject_release);
      }
      EXPORT_SYMBOL(kobject_put);
      
      /*
       * Drop one reference from @kref; forwards to kref_sub() with a count of 1
       * and returns its result.
       */
      static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
      {
          return kref_sub(kref, 1, release);
      }
      
      /*
       * Subtract @count references from @kref. If the counter reaches zero,
       * @release is invoked and 1 is returned; otherwise 0 is returned.
       */
      static inline int kref_sub(struct kref *kref, unsigned int count,
               void (*release)(struct kref *kref))
      {
          WARN_ON(release == NULL);

          /* Still referenced: nothing to release. */
          if (!atomic_sub_and_test((int) count, &kref->refcount))
              return 0;

          release(kref);
          return 1;
      }

      根據上面的代碼追蹤,得知kobject_release才是釋放kobject的主角:

      /*
       * kref release callback: recovers the kobject that embeds @kref and cleans
       * it up, either immediately or (debug builds) after a delay.
       */
      static void kobject_release(struct kref *kref)
      {
          struct kobject *kobj = container_of(kref, struct kobject, kref);
      #ifdef CONFIG_DEBUG_KOBJECT_RELEASE
          /* HZ plus a random 0..3 further multiples of HZ */
          unsigned long delay = HZ + HZ * (get_random_int() & 0x3);
          pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n",
               kobject_name(kobj), kobj, __func__, kobj->parent, delay);
          INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup);
          // schedule kobject_delayed_cleanup to do the cleanup after the delay
          schedule_delayed_work(&kobj->release, delay);
      #else
          kobject_cleanup(kobj);  // clean up right away
      #endif
      }

      如果在內核編譯時指定CONFIG_DEBUG_KOBJECT_RELEASE,則使用延遲release方式調用kobject_delayed_cleanup,否則直接調用kobject_cleanup。

      #ifdef CONFIG_DEBUG_KOBJECT_RELEASE
      static void kobject_delayed_cleanup(struct work_struct *work)
      {
          /* Walk back from the work item to the kobject that embeds it. */
          struct delayed_work *dwork = to_delayed_work(work);
          struct kobject *kobj = container_of(dwork, struct kobject, release);

          /* The delayed path ends in the same cleanup as the direct one. */
          kobject_cleanup(kobj);
      }
      #endif
      
      /*
       * kobject_cleanup - free kobject resources.
       * @kobj: object to cleanup
       */
      static void kobject_cleanup(struct kobject *kobj)
      {
          struct kobj_type *t = get_ktype(kobj);
          /* Read the name now: it is used after t->release(kobj) has run. */
          const char *name = kobj->name;
      
          pr_debug("kobject: '%s' (%p): %s, parent %p\n",
               kobject_name(kobj), kobj, __func__, kobj->parent);
      
          if (t && !t->release)
              pr_debug("kobject: '%s' (%p): does not have a release() "
                   "function, it is broken and must be fixed.\n",
                   kobject_name(kobj), kobj);
      
          /* send "remove" if the caller did not do it but sent "add" */
          if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) {
              pr_debug("kobject: '%s' (%p): auto cleanup 'remove' event\n",
                   kobject_name(kobj), kobj);
              kobject_uevent(kobj, KOBJ_REMOVE); // the REMOVE event is sent only once
          }
      
          /* remove from sysfs if the caller did not do it */
          if (kobj->state_in_sysfs) {
              pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n",
                   kobject_name(kobj), kobj);
              kobject_del(kobj); // caller did not remove it from sysfs, so do it here
          }
      
          if (t && t->release) {
              pr_debug("kobject: '%s' (%p): calling ktype release\n",
                   kobject_name(kobj), kobj);
              t->release(kobj); // invoke the kobj_type release callback
          }
      
          /* free name if we allocated it */
          if (name) {
              pr_debug("kobject: '%s': free name\n", name);
              kfree_const(name);
          }
      }

      內核對象集相關操作

      /* kset API overview. */
      /* Initialize a kset for use. */
      void kset_init(struct kset *kset);
      /* Dynamically create a kset; NULL is returned on failure. */
      struct kset *kset_create(const char *name, const struct kset_uevent_ops *uevent_ops, struct kobject *parent_kobj);
      /* Initialize and add a kset. */
      int kset_register(struct kset *kset);
      /* Remove a kset. */
      void kset_unregister(struct kset *kset);
      struct kset * kset_create_and_add(const char *name, const struct kset_uevent_ops *u, struct kobject *parent_kobj);

      內核對象集創建及初始化

      內核對象集由kset_create創建:

      /**
       * kset_create - create a struct kset dynamically
       *
       * @name: the name for the kset
       * @uevent_ops: a struct kset_uevent_ops for the kset
       * @parent_kobj: the parent kobject of this kset, if any.
       *
       * This function creates a kset structure dynamically.  This structure can
       * then be registered with the system and show up in sysfs with a call to
       * kset_register().  When you are finished with this structure, if
       * kset_register() has been called, call kset_unregister() and the
       * structure will be dynamically freed when it is no longer being used.
       *
       * If the kset was not able to be created, NULL will be returned.
       */
      static struct kset *kset_create(const char *name,
                      const struct kset_uevent_ops *uevent_ops,
                      struct kobject *parent_kobj)
      {
          /* Zeroed allocation; every field not set below stays 0/NULL. */
          struct kset *new_set = kzalloc(sizeof(*new_set), GFP_KERNEL);

          if (!new_set)
              return NULL;

          /* Name the embedded kobject; on failure release the allocation. */
          if (kobject_set_name(&new_set->kobj, "%s", name)) {
              kfree(new_set);
              return NULL;
          }

          new_set->uevent_ops = uevent_ops;
          new_set->kobj.parent = parent_kobj;

          /*
           * The embedded kobject gets kset_ktype as its type and is a member
           * of no kset itself, so that it can be freed properly once unused.
           */
          new_set->kobj.ktype = &kset_ktype;
          new_set->kobj.kset = NULL;

          return new_set;
      }

      內核對象集由kset_init執行初始化:

      /**
       * kset_init - initialize a kset for use
       * @k: kset
       */
      void kset_init(struct kset *k)
      {
          struct kobject *embedded = &k->kobj;

          /* Initialize the embedded kobject first, then the member list and its lock. */
          kobject_init_internal(embedded);
          INIT_LIST_HEAD(&k->list);
          spin_lock_init(&k->list_lock);
      }
      
      /*
       * Reset a kobject to its freshly-initialized state: new reference count,
       * empty list link, sysfs/uevent flags cleared, state_initialized set.
       * A NULL @kobj is ignored.
       */
      static void kobject_init_internal(struct kobject *kobj)
      {
          if (kobj) {
              kref_init(&kobj->kref);
              INIT_LIST_HEAD(&kobj->entry);

              kobj->state_in_sysfs = 0;
              kobj->state_add_uevent_sent = 0;
              kobj->state_remove_uevent_sent = 0;
              kobj->state_initialized = 1;
          }
      }

      初始化kset之後,調用kset_register,將kset添加到sysfs:

      /**
       * kset_register - initialize and add a kset.
       * @k: kset.
       */
      int kset_register(struct kset *k)
      {
          int rc;

          if (!k)
              return -EINVAL;

          kset_init(k);

          /* Register the embedded kobject; announce it only if that succeeded. */
          rc = kobject_add_internal(&k->kobj);
          if (rc == 0)
              kobject_uevent(&k->kobj, KOBJ_ADD);

          return rc;
      }
      EXPORT_SYMBOL(kset_register);

      經過kset_create、kset_init和kset_register之後,kset已初始化并添加完成。當然kset_create_and_add包含了這三個函數。

      內核對象集釋放

      內核對象集的釋放過程與kobject的釋放過程類似,由kset_unregister完成:

      /**
       * kset_unregister - remove a kset.
       * @k: kset.
       */
      void kset_unregister(struct kset *k)
      {
          if (k) {
              /* Take the embedded kobject out of sysfs, then drop our reference. */
              kobject_del(&k->kobj);
              kobject_put(&k->kobj);
          }
      }
      EXPORT_SYMBOL(kset_unregister);

      發送事件到用戶空間

      由前面的代碼可以看到,無論kobject或是kset,都會向用戶空間發送事件,由kobject_uevent函數通過設置環境變量的方式完成:

      /* Environment assembled for one uevent. */
      struct kobj_uevent_env {
          char *argv[3];                // command line used by the user-space helper
          char *envp[UEVENT_NUM_ENVP];  // array of environment variable strings
          int envp_idx;                 // index of the next free envp slot
          char buf[UEVENT_BUFFER_SIZE]; // buffer holding the environment variable data
          int buflen;
      };
      
      /**
       * kobject_uevent - notify userspace by sending an uevent
       *
       * @action: action that is happening
       * @kobj: struct kobject that the action is happening to
       *
       * Returns 0 if kobject_uevent() is completed with success or the
       * corresponding error when it fails.
       */
      int kobject_uevent(struct kobject *kobj, enum kobject_action action)
      {
          /* Same as kobject_uevent_env() with no extra environment strings. */
          return kobject_uevent_env(kobj, action, NULL); // does the actual sending
      }
      EXPORT_SYMBOL_GPL(kobject_uevent);
      
      /**
       * kobject_uevent_env - send an uevent with environmental data
       *
       * @action: action that is happening
       * @kobj: struct kobject that the action is happening to
       * @envp_ext: pointer to environmental data
       *
       * Returns 0 if kobject_uevent_env() is completed with success or the
       * corresponding error when it fails.
       */
      int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
                     char *envp_ext[])
      {
          struct kobj_uevent_env *env;
          const char *action_string = kobject_actions[action];
          const char *devpath = NULL;
          const char *subsystem;
          struct kobject *top_kobj;
          struct kset *kset;
          const struct kset_uevent_ops *uevent_ops;
          int i = 0;
          int retval = 0;
      #ifdef CONFIG_NET
          struct uevent_sock *ue_sk;
      #endif
      
          pr_debug("kobject: '%s' (%p): %s\n",
               kobject_name(kobj), kobj, __func__);
      
          /* search the kset we belong to */
          top_kobj = kobj;
          while (!top_kobj->kset && top_kobj->parent)  // walk up to the nearest ancestor with a kset; the kset carries the uevent_ops
              top_kobj = top_kobj->parent;
      
          if (!top_kobj->kset) {
              pr_debug("kobject: '%s' (%p): %s: attempted to send uevent "
                   "without kset!\n", kobject_name(kobj), kobj,
                   __func__);
              return -EINVAL;
          }
      
          kset = top_kobj->kset;
          uevent_ops = kset->uevent_ops;  // the kset's uevent_ops drive the rest of the send
      
          /* skip the event, if uevent_suppress is set*/
          if (kobj->uevent_suppress) {  // kobjects flagged uevent_suppress send nothing
              pr_debug("kobject: '%s' (%p): %s: uevent_suppress "
                       "caused the event to drop!\n",
                       kobject_name(kobj), kobj, __func__);
              return 0;
          }
          /* skip the event, if the filter returns zero. */
          if (uevent_ops && uevent_ops->filter)  // call the uevent_ops filter callback
              if (!uevent_ops->filter(kset, kobj)) {
                  pr_debug("kobject: '%s' (%p): %s: filter function "
                       "caused the event to drop!\n",
                       kobject_name(kobj), kobj, __func__);
                  return 0;
              }
      
          /* originating subsystem */
          if (uevent_ops && uevent_ops->name)  // subsystem name: from the name callback, else the kset's own name
              subsystem = uevent_ops->name(kset, kobj);
          else
              subsystem = kobject_name(&kset->kobj);
          if (!subsystem) {
              pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the "
                   "event to drop!\n", kobject_name(kobj), kobj,
                   __func__);
              return 0;
          }
      
          /* environment buffer */
          env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); // allocate the kobj_uevent_env
          if (!env)
              return -ENOMEM;
      
          /* complete object path */
          devpath = kobject_get_path(kobj, GFP_KERNEL);
          if (!devpath) {
              retval = -ENOENT;
              goto exit;
          }
      
          /* default keys: add the standard environment variables */
          retval = add_uevent_var(env, "ACTION=%s", action_string);
          if (retval)
              goto exit;
          retval = add_uevent_var(env, "DEVPATH=%s", devpath);
          if (retval)
              goto exit;
          retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem);
          if (retval)
              goto exit;
      
          /* keys passed in from the caller */
          if (envp_ext) {
              for (i = 0; envp_ext[i]; i++) {
                  retval = add_uevent_var(env, "%s", envp_ext[i]);
                  if (retval)
                      goto exit;
              }
          }
      
          /* let the kset specific function add its stuff */
          if (uevent_ops && uevent_ops->uevent) { // uevent callback adds subsystem-specific environment variables
              retval = uevent_ops->uevent(kset, kobj, env);
              if (retval) {
                  pr_debug("kobject: '%s' (%p): %s: uevent() returned "
                       "%d\n", kobject_name(kobj), kobj,
                       __func__, retval);
                  goto exit;
              }
          }
      
          /*
           * Mark "add" and "remove" events in the object to ensure proper
           * events to userspace during automatic cleanup. If the object did
           * send an "add" event, "remove" will automatically generated by
           * the core, if not already done by the caller.
           */
          if (action == KOBJ_ADD)
              kobj->state_add_uevent_sent = 1;
          else if (action == KOBJ_REMOVE)
              kobj->state_remove_uevent_sent = 1;
      
          mutex_lock(&uevent_sock_mutex);
          /* we will send an event, so request a new sequence number */
          retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum);
          if (retval) {
              mutex_unlock(&uevent_sock_mutex);
              goto exit;
          }
      
      #if defined(CONFIG_NET)  // when built with CONFIG_NET, send the event over netlink
          /* send netlink message */
          list_for_each_entry(ue_sk, &uevent_sock_list, list) {
              struct sock *uevent_sock = ue_sk->sk;
              struct sk_buff *skb;
              size_t len;
      
              if (!netlink_has_listeners(uevent_sock, 1))
                  continue;
      
              /* allocate message with the maximum possible size */
              len = strlen(action_string) + strlen(devpath) + 2;
              skb = alloc_skb(len + env->buflen, GFP_KERNEL);
              if (skb) {
                  char *scratch;
      
                  /* add header */
                  scratch = skb_put(skb, len);
                  sprintf(scratch, "%s@%s", action_string, devpath);
      
                  /* copy keys to our continuous event payload buffer */
                  for (i = 0; i < env->envp_idx; i++) {
                      len = strlen(env->envp[i]) + 1;
                      scratch = skb_put(skb, len);
                      strcpy(scratch, env->envp[i]);
                  }
      
                  NETLINK_CB(skb).dst_group = 1;
                  retval = netlink_broadcast_filtered(uevent_sock, skb, // broadcast over netlink
                                      0, 1, GFP_KERNEL,
                                      kobj_bcast_filter,
                                      kobj);
                  /* ENOBUFS should be handled in userspace */
                  if (retval == -ENOBUFS || retval == -ESRCH)
                      retval = 0;
              } else
                  retval = -ENOMEM;
          }
      #endif
          mutex_unlock(&uevent_sock_mutex);
      
      #ifdef CONFIG_UEVENT_HELPER  // when built with CONFIG_UEVENT_HELPER, additionally run the user-space helper
          /* call uevent_helper, usually only enabled during early boot */
          if (uevent_helper[0] && !kobj_usermode_filter(kobj)) {
              struct subprocess_info *info;
      
              retval = add_uevent_var(env, "HOME=/");
              if (retval)
                  goto exit;
              retval = add_uevent_var(env,
                          "PATH=/sbin:/bin:/usr/sbin:/usr/bin");
              if (retval)
                  goto exit;
              retval = init_uevent_argv(env, subsystem); // assemble the helper command and its arguments in env->argv
              if (retval)
                  goto exit;
      
              retval = -ENOMEM;
              info = call_usermodehelper_setup(env->argv[0], env->argv,  // prepare the user-space program named by env->argv[0] (presumably the configured uevent_helper, e.g. /sbin/hotplug; confirm in init_uevent_argv)
                               env->envp, GFP_KERNEL,
                               NULL, cleanup_uevent_env, env);
              if (info) {
                  retval = call_usermodehelper_exec(info, UMH_NO_WAIT);
                  env = NULL; /* freed by cleanup_uevent_env */
              }
          }
      #endif
      
      exit:
          kfree(devpath);
          kfree(env);
          return retval;
      }
      EXPORT_SYMBOL_GPL(kobject_uevent_env);

      sysfs與內核對象

      本篇文章不是以文件系統的角度來詳細描述sysfs,而是從內核對象如何通過sysfs表示整個設備驅動模型為切入點,進一步理解Linux內核對象。

      內核對象添加到sysfs

      在上文《內核對象與對象集》中,將kobject添加到sysfs的調用鏈為kobject_add –> kobject_add_varg –> kobject_add_internal,最終調用create_dir創建sysfs目錄和屬性文件。

      /*
       * Create the sysfs directory for @kobj, fill it with the default attribute
       * files, pin the directory node, and enable namespace support when the
       * kobject has child namespace ops. Returns 0 or the first error met.
       */
      static int create_dir(struct kobject *kobj)
      {
          const struct kobj_ns_type_operations *ops;
          int rc;

          /* The directory itself. */
          rc = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));
          if (rc)
              return rc;

          /* Default attribute files; remove the directory again if this fails. */
          rc = populate_dir(kobj);
          if (rc) {
              sysfs_remove_dir(kobj);
              return rc;
          }

          /*
           * @kobj->sd may be deleted by an ancestor going away.  Hold an
           * extra reference so that it stays until @kobj is gone.
           */
          sysfs_get(kobj->sd);

          /*
           * If @kobj has ns_ops, its children need to be filtered based on
           * their namespace tags.  Enable namespace support on @kobj->sd.
           */
          ops = kobj_child_ns_ops(kobj);
          if (!ops)
              return 0;

          BUG_ON(ops->type <= KOBJ_NS_TYPE_NONE);
          BUG_ON(ops->type >= KOBJ_NS_TYPES);
          BUG_ON(!kobj_ns_type_registered(ops->type));

          sysfs_enable_ns(kobj->sd);
          return 0;
      }
      
      /*
       * populate_dir - populate directory with attributes.
       * @kobj: object we're working on.
       *
       * Most subsystems have a set of default attributes that are associated
       * with an object that registers with them.  This is a helper called during
       * object registration that loops through the default attributes of the
       * subsystem and creates attributes files for them in sysfs.
       */
      static int populate_dir(struct kobject *kobj)
      {
          struct kobj_type *ktype = get_ktype(kobj);
          struct attribute **cursor;
          int rc;

          /* No ktype or no default attributes: nothing to create. */
          if (!ktype || !ktype->default_attrs)
              return 0;

          /* One sysfs file per entry of the NULL-terminated array; stop at the first failure. */
          for (cursor = ktype->default_attrs; *cursor != NULL; cursor++) {
              rc = sysfs_create_file(kobj, *cursor);
              if (rc)
                  return rc;
          }

          return 0;
      }

      create_dir通過調(diào)用sysfs_create_dir_ns創(chuàng)建sysfs中的目錄,調(diào)用sysfs_create_file創(chuàng)建屬性文件。

      sysfs的核心结构

      kernfs_node代表sysfs中的每个节点。

      /*
       * kernfs_node - the building block of kernfs hierarchy.  Each and every
       * kernfs node is represented by single kernfs_node.  Most fields are
       * private to kernfs and shouldn't be accessed directly by kernfs users.
       *
       * As long as s_count reference is held, the kernfs_node itself is
       * accessible.  Dereferencing elem or any other outer entity requires
       * active reference.
       */
      struct kernfs_node {
          atomic_t        count;   // reference count
          atomic_t        active;  // active reference count
      #ifdef CONFIG_DEBUG_LOCK_ALLOC
          struct lockdep_map  dep_map;
      #endif
          /*
           * Use kernfs_get_parent() and kernfs_name/path() instead of
           * accessing the following two fields directly.  If the node is
           * never moved to a different parent, it is safe to access the
           * parent directly.
           */
          struct kernfs_node  *parent; // parent node
          const char      *name;       // node name, as shown in sysfs
      
          struct rb_node      rb;      // link into the parent's red-black tree of children
      
          const void      *ns;    /* namespace tag */
          unsigned int        hash;   /* ns + name hash; used as the red-black tree key */
          union {
              struct kernfs_elem_dir      dir;     // used when the node is a directory
              struct kernfs_elem_symlink  symlink; // used when the node is a symlink
              struct kernfs_elem_attr     attr;    // used when the node is an attribute file
          };
      
          void            *priv;  // opaque owner data; sysfs stores the kobject (directories) or the attribute (files) here
      
          unsigned short      flags; // flag bits: node type (directory, symlink, attribute file) and state such as "removed"
          umode_t         mode;      // access mode of the node in sysfs
          unsigned int        ino;   // unique node number
          struct kernfs_iattrs    *iattr;  // non-default inode attributes; NULL when none are set
      };

      在sysfs中创建目录:sysfs_create_dir_ns

      /**
       * sysfs_create_dir_ns - create a directory for an object with a namespace tag
       * @kobj: object we're creating directory for
       * @ns: the namespace tag to use
       *
       * The directory is created under the parent kobject's sysfs node, or
       * under the sysfs root when @kobj has no parent.  On success the new
       * node is stored in @kobj->sd and 0 is returned.  Returns -ENOENT when
       * no parent node is available, otherwise the error reported by
       * kernfs_create_dir_ns() (a duplicate name is additionally warned about).
       */
      int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
      {
          struct kernfs_node *parent_kn;
          struct kernfs_node *dir_kn;
      
          BUG_ON(!kobj);
      
          /* Use the parent's directory if there is one, else the sysfs root. */
          parent_kn = kobj->parent ? kobj->parent->sd : sysfs_root_kn;
          if (!parent_kn)
              return -ENOENT;
      
          dir_kn = kernfs_create_dir_ns(parent_kn, kobject_name(kobj),
                                        S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
          if (!IS_ERR(dir_kn)) {
              kobj->sd = dir_kn;
              return 0;
          }
      
          if (PTR_ERR(dir_kn) == -EEXIST)
              sysfs_warn_dup(parent_kn, kobject_name(kobj));
          return PTR_ERR(dir_kn);
      }
      
      /**
       * kernfs_create_dir_ns - create a directory
       * @parent: parent in which to create a new directory
       * @name: name of the new directory
       * @mode: mode of the new directory
       * @priv: opaque data associated with the new directory
       * @ns: optional namespace tag of the directory
       *
       * Returns the created node on success, ERR_PTR() value on failure.
       */
      struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
                           const char *name, umode_t mode,
                           void *priv, const void *ns)
      {
          struct kernfs_node *kn;
          int rc;
      
          /* allocate and initialize the node; KERNFS_DIR marks it as a directory */
          kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
          if (!kn)
              return ERR_PTR(-ENOMEM);
      
          kn->dir.root = parent->dir.root; // the new directory shares its parent's root
          kn->ns = ns;  // namespace tag
          kn->priv = priv;
      
          /* link in */
          rc = kernfs_add_one(kn); // insert the node into the parent directory's red-black tree
          if (!rc)
              return kn;
      
          /* linking failed: drop the reference taken at allocation */
          kernfs_put(kn);
          return ERR_PTR(rc);
      }

      kernfs_create_dir_ns函数中的两个主要函数kernfs_new_node和kernfs_add_one,在创建文件和创建符号链接时同样会用到,仅是参数不同。

      为kernfs_node结构分配空间,并初始化:

      /*
       * kernfs_new_node - allocate a kernfs_node below @parent.
       *
       * The node is allocated and initialized by __kernfs_new_node() using the
       * root that @parent belongs to.  On success a reference on @parent is
       * taken and recorded in the new node's ->parent.  Returns NULL when the
       * allocation fails.
       */
      struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
                          const char *name, umode_t mode,
                          unsigned flags)
      {
          struct kernfs_node *node;
      
          node = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
          if (!node)
              return NULL;
      
          kernfs_get(parent);
          node->parent = parent;
          return node;
      }
      
      /*
       * __kernfs_new_node - allocate and initialize a kernfs_node.
       *
       * Duplicates @name, allocates a zeroed node from kernfs_node_cache and
       * reserves an inode number from @root->ino_ida.  The returned node has a
       * reference count of 1, is marked deactivated and is not linked into
       * any rbtree.  Returns NULL if any allocation step fails; everything
       * acquired so far is released in that case.
       */
      static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
                               const char *name, umode_t mode,
                               unsigned flags)
      {
          struct kernfs_node *kn;
          int ret;
      
          name = kstrdup_const(name, GFP_KERNEL); // duplicate the name string
          if (!name)
              return NULL;
      
          kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); // zeroed node from the kernfs_node slab cache
          if (!kn)
              goto err_out1;
      
          /*
           * If the ino of the sysfs entry created for a kmem cache gets
           * allocated from an ida layer, which is accounted to the memcg that
           * owns the cache, the memcg will get pinned forever. So do not account
           * ino ida allocations.
           */
          ret = ida_simple_get(&root->ino_ida, 1, 0,  // allocate a unique id, stored below as kn->ino
                       GFP_KERNEL | __GFP_NOACCOUNT);
          if (ret < 0)
              goto err_out2;
          kn->ino = ret;
      
          atomic_set(&kn->count, 1);  // initial reference, owned by the caller
          atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
          RB_CLEAR_NODE(&kn->rb);
          // fill in the remaining fields
          kn->name = name;
          kn->mode = mode;
          kn->flags = flags;
      
          return kn;
      
       err_out2:
          kmem_cache_free(kernfs_node_cache, kn);
       err_out1:
          kfree_const(name);
          return NULL;
      }

      将kernfs_node添加到parent的红黑树中:

      /**
       *  kernfs_add_one - add kernfs_node to parent without warning
       *  @kn: kernfs_node to be added
       *
       *  The caller must already have initialized @kn->parent.  This
       *  function increments nlink of the parent's inode if @kn is a
       *  directory and link into the children list of the parent.
       *
       *  RETURNS:
       *  0 on success, -EEXIST if entry with the given name already
       *  exists.  -EINVAL is returned when the namespace tag does not match
       *  the parent's namespace setting or the parent is not a directory,
       *  and -ENOENT when the parent is flagged KERNFS_EMPTY_DIR or is no
       *  longer active.
       */
      int kernfs_add_one(struct kernfs_node *kn)
      {
          struct kernfs_node *parent = kn->parent;
          struct kernfs_iattrs *ps_iattr;
          bool has_ns;
          int ret;
      
          mutex_lock(&kernfs_mutex);
      
          ret = -EINVAL;
          has_ns = kernfs_ns_enabled(parent);
          if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
               has_ns ? "required" : "invalid", parent->name, kn->name))
              goto out_unlock;
      
          if (kernfs_type(parent) != KERNFS_DIR) // the parent must be a directory
              goto out_unlock;
      
          ret = -ENOENT;
          if (parent->flags & KERNFS_EMPTY_DIR)  // nothing may be added below a parent flagged KERNFS_EMPTY_DIR
              goto out_unlock;
          // a parent that was activated but is no longer active cannot take new children
          if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
              goto out_unlock;
      
          kn->hash = kernfs_name_hash(kn->name, kn->ns); // key used for red-black tree comparison
      
          ret = kernfs_link_sibling(kn); // link the node into the parent's red-black tree
          if (ret)
              goto out_unlock;
      
          /* Update timestamps on the parent */
          ps_iattr = parent->iattr;
          if (ps_iattr) {
              struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
              ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
          }
      
          mutex_unlock(&kernfs_mutex);
      
          /*
           * Activate the new node unless CREATE_DEACTIVATED is requested.
           * If not activated here, the kernfs user is responsible for
           * activating the node with kernfs_activate().  A node which hasn't
           * been activated is not visible to userland and its removal won't
           * trigger deactivation.
           */
          if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
              kernfs_activate(kn);
          return 0;
      
      out_unlock:
          mutex_unlock(&kernfs_mutex);
          return ret;
      }

      sysfs红黑树中的key:

      /**
       *  kernfs_name_hash - compute the rbtree key of a node
       *  @name: Null terminated string to hash
       *  @ns:   Namespace tag to hash
       *
       *  Hashes every character of @name, mixes in a 31 bit hash of the @ns
       *  pointer and masks the result to 31 bits (so it fits in an off_t).
       *  The values 0, 1 and INT_MAX are never returned; they are reserved
       *  for magic directory entries.
       */
      static unsigned int kernfs_name_hash(const char *name, const void *ns)
      {
          unsigned long hash = init_name_hash();
          const char *end = name + strlen(name);
      
          for (; name != end; name++)
              hash = partial_name_hash(*name, hash);
      
          hash = end_name_hash(hash) ^ hash_ptr((void *)ns, 31);
          hash &= 0x7fffffffU;
      
          /* Keep clear of the reserved values 0, 1 and INT_MAX. */
          if (hash < 2)
              hash += 2;
          if (hash >= INT_MAX)
              hash = INT_MAX - 1;
          return hash;
      }
      
      /*
       * kernfs_name_compare - order a (hash, ns, name) key against a node.
       *
       * Compares by hash first, then by namespace tag pointer, and finally by
       * name using strcmp().  Returns a negative value, zero or a positive
       * value when the key sorts before, equal to or after @kn.
       */
      static int kernfs_name_compare(unsigned int hash, const char *name,
                         const void *ns, const struct kernfs_node *kn)
      {
          if (hash != kn->hash)
              return hash < kn->hash ? -1 : 1;
          if (ns != kn->ns)
              return ns < kn->ns ? -1 : 1;
          return strcmp(name, kn->name);
      }
      • kernfs_name_hash: 根据name和ns计算kernfs_node的hash值,保存在kernfs_node的hash域中。

      • kernfs_name_compare: sysfs红黑树key的比较函数,比较的先后顺序是: 先比较hash,再比较ns,最后比较name。

      将kernfs_node链入parent节点的红黑树中:

      /**
       *  kernfs_link_sibling - link kernfs_node into sibling rbtree
       *  @kn: kernfs_node of interest
       *
       *  Link @kn into its sibling rbtree which starts from
       *  @kn->parent->dir.children.
       *
       *  Locking:
       *  mutex_lock(kernfs_mutex)
       *
       *  RETURNS:
       *  0 on success, -EEXIST on failure.
       */
      static int kernfs_link_sibling(struct kernfs_node *kn)
      {
          struct rb_node **node = &kn->parent->dir.children.rb_node; // root of the parent directory's red-black tree
          struct rb_node *parent = NULL;
      
          while (*node) {  // walk down the tree to find the slot where kn belongs
              struct kernfs_node *pos;
              int result;
      
              pos = rb_to_kn(*node);
              parent = *node;
              // NOTE(review): kernfs_sd_compare() is not shown here; presumably it orders by hash, then ns, then name
              result = kernfs_sd_compare(kn, pos);
              if (result < 0)
                  node = &pos->rb.rb_left;
              else if (result > 0)
                  node = &pos->rb.rb_right;
              else
                  return -EEXIST;
          }
      
          /* add new node and rebalance the tree */
          rb_link_node(&kn->rb, parent, node);
          rb_insert_color(&kn->rb, &kn->parent->dir.children);
      
          /* successfully added, account subdir number */
          if (kernfs_type(kn) == KERNFS_DIR)
              kn->parent->dir.subdirs++;
      
          return 0;
      }

      在sysfs中创建文件

      /*
       * sysfs_create_file - create an attribute file for @kobj without a
       * namespace tag.  Thin wrapper that forwards to sysfs_create_file_ns()
       * with a NULL namespace and returns its result.
       */
      static inline int __must_check sysfs_create_file(struct kobject *kobj,
                               const struct attribute *attr)
      {
          return sysfs_create_file_ns(kobj, attr, NULL);
      }
      
      /**
       * sysfs_create_file_ns - create an attribute file for an object with custom ns
       * @kobj: object we're creating for
       * @attr: attribute descriptor
       * @ns: namespace the new file should belong to
       *
       * The file is created in @kobj's sysfs directory (@kobj->sd) with the
       * mode taken from @attr->mode.  Returns the result of
       * sysfs_add_file_mode_ns().
       */
      int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
                   const void *ns)
      {
          /* the kobject must already have a sysfs directory */
          BUG_ON(!kobj || !kobj->sd || !attr);
      
          return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);
      
      }
      EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
      
      /*
       * sysfs_add_file_mode_ns - create a regular or binary attribute file.
       * @parent: kernfs directory node the file is created in
       * @attr: attribute descriptor (a struct bin_attribute when @is_bin)
       * @is_bin: true for a binary attribute
       * @mode: file mode; the SYSFS_PREALLOC bit selects the prealloc ops
       * @ns: namespace tag for the new file
       *
       * Picks the kernfs_ops matching the callbacks the attribute owner
       * provides, then creates the file with __kernfs_create_file().
       * Returns 0 on success, -EINVAL when a regular attribute's kobject has
       * no sysfs_ops, or the error from __kernfs_create_file().
       */
      int sysfs_add_file_mode_ns(struct kernfs_node *parent,
                     const struct attribute *attr, bool is_bin,
                     umode_t mode, const void *ns)
      {
          struct lock_class_key *key = NULL;
          const struct kernfs_ops *ops;
          struct kernfs_node *kn;
          loff_t size;
      
          if (!is_bin) {
              /* the directory node's priv is the owning kobject */
              struct kobject *kobj = parent->priv;
              const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
      
              /* every kobject with an attribute needs a ktype assigned */
              if (WARN(!sysfs_ops, KERN_ERR
                   "missing sysfs attribute operations for kobject: %s\n",
                   kobject_name(kobj)))
                  return -EINVAL;
              // choose the read/write ops from which of show/store exist and from SYSFS_PREALLOC
              if (sysfs_ops->show && sysfs_ops->store) {
                  if (mode & SYSFS_PREALLOC)
                      ops = &sysfs_prealloc_kfops_rw;
                  else
                      ops = &sysfs_file_kfops_rw;
              } else if (sysfs_ops->show) {
                  if (mode & SYSFS_PREALLOC)
                      ops = &sysfs_prealloc_kfops_ro;
                  else
                      ops = &sysfs_file_kfops_ro;
              } else if (sysfs_ops->store) {
                  if (mode & SYSFS_PREALLOC)
                      ops = &sysfs_prealloc_kfops_wo;
                  else
                      ops = &sysfs_file_kfops_wo;
              } else
                  ops = &sysfs_file_kfops_empty;
      
              size = PAGE_SIZE;
          } else {
              struct bin_attribute *battr = (void *)attr;
      
              /* binary attribute: choose by the mmap/read/write callbacks */
              if (battr->mmap)
                  ops = &sysfs_bin_kfops_mmap;
              else if (battr->read && battr->write)
                  ops = &sysfs_bin_kfops_rw;
              else if (battr->read)
                  ops = &sysfs_bin_kfops_ro;
              else if (battr->write)
                  ops = &sysfs_bin_kfops_wo;
              else
                  ops = &sysfs_file_kfops_empty;
      
              size = battr->size;
          }
      
      #ifdef CONFIG_DEBUG_LOCK_ALLOC
          if (!attr->ignore_lockdep)
              key = attr->key ?: (struct lock_class_key *)&attr->skey;
      #endif
          kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
                        (void *)attr, ns, key); // create the attribute file; attr becomes the node's priv
          if (IS_ERR(kn)) {
              if (PTR_ERR(kn) == -EEXIST)
                  sysfs_warn_dup(parent, attr->name);
              return PTR_ERR(kn);
          }
          return 0;
      }

      通过上面的代码跟踪可以看到,创建属性文件由__kernfs_create_file实现,最终仍然是调用kernfs_new_node和kernfs_add_one。

      /**
       * __kernfs_create_file - kernfs internal function to create a file
       * @parent: directory to create the file in
       * @name: name of the file
       * @mode: mode of the file
       * @size: size of the file
       * @ops: kernfs operations for the file
       * @priv: private data for the file
       * @ns: optional namespace tag of the file
       * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
       *
       * Returns the created node on success, ERR_PTR() value on error.
       */
      struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
                           const char *name,
                           umode_t mode, loff_t size,
                           const struct kernfs_ops *ops,
                           void *priv, const void *ns,
                           struct lock_class_key *key)
      {
          struct kernfs_node *kn;
          unsigned flags;
          int rc;
      
          flags = KERNFS_FILE; // the new node is a file
          // allocate and initialize the node as a regular file
          kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
          if (!kn)
              return ERR_PTR(-ENOMEM);
      
          kn->attr.ops = ops;
          kn->attr.size = size;
          kn->ns = ns;
          kn->priv = priv;
      
      #ifdef CONFIG_DEBUG_LOCK_ALLOC
          if (key) {
              lockdep_init_map(&kn->dep_map, "s_active", key, 0);
              kn->flags |= KERNFS_LOCKDEP;
          }
      #endif
      
          /*
           * kn->attr.ops is accessible only while holding active ref.  We
           * need to know whether some ops are implemented outside active
           * ref.  Cache their existence in flags.
           */
          if (ops->seq_show)
              kn->flags |= KERNFS_HAS_SEQ_SHOW;
          if (ops->mmap)
              kn->flags |= KERNFS_HAS_MMAP;
      
          rc = kernfs_add_one(kn); // link the node into the parent's red-black tree
          if (rc) {
              /* linking failed: drop the reference taken at allocation */
              kernfs_put(kn);
              return ERR_PTR(rc);
          }
          return kn;
      }

      在sysfs_add_file_mode_ns函数中,根据kobject的sysfs_ops是否提供show/store以及mode中是否设置SYSFS_PREALLOC,选择不同的读写操作表(kernfs_ops)。下面以sysfs_prealloc_kfops_rw这个操作表为例,其他操作表结构类似,不再赘述。

      // kernfs_ops for regular read/write sysfs files that use a preallocated buffer
      static const struct kernfs_ops sysfs_prealloc_kfops_rw = {
          .read       = sysfs_kf_read,
          .write      = sysfs_kf_write,
          .prealloc   = true,
      };
      
      /*
       * kernfs read callback for regular sysfs files with pre-alloc.
       *
       * Looks up the sysfs_ops of the file's kobject (the parent directory
       * node's priv) and lets its ->show() fill @buf.  Only reads at offset 0
       * into the preallocated buffer are served; anything else returns 0.
       * Returns the smaller of @count and the length reported by ->show().
       */
      static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
                       size_t count, loff_t pos)
      {
          const struct sysfs_ops *sops = sysfs_file_ops(of->kn);
          struct kobject *owner = of->kn->parent->priv;
          size_t shown;
      
          if (pos)
              return 0;
          /*
           * If buf != of->prealloc_buf, we don't know how
           * large it is, so cannot safely pass it to ->show
           */
          if (WARN_ON_ONCE(buf != of->prealloc_buf))
              return 0;
      
          shown = sops->show(owner, of->kn->priv, buf);
          return shown < count ? shown : count;
      }
      
      /*
       * kernfs write callback for regular sysfs files.
       *
       * Forwards the written data to the ->store() method of the sysfs_ops
       * belonging to the file's kobject (the parent directory node's priv).
       * A zero-length write returns 0 without calling ->store().
       */
      static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
                        size_t count, loff_t pos)
      {
          const struct sysfs_ops *sops = sysfs_file_ops(of->kn);
          struct kobject *owner = of->kn->parent->priv;
      
          if (count == 0)
              return 0;
      
          return sops->store(owner, of->kn->priv, buf, count);
      }

      关于属性文件的读写操作,最终都回调到kobject的ktype域中的sysfs_ops操作表(kobj->ktype->sysfs_ops),这个ktype是在kobject_init函数中设置的。回顾kobject_create函数:

      /*
       * kobject_create - allocate a kobject and initialize it with
       * dynamic_kobj_ktype.  Returns the new kobject, or NULL when the
       * allocation fails.
       */
      struct kobject *kobject_create(void)
      {
          struct kobject *new_kobj = kzalloc(sizeof(*new_kobj), GFP_KERNEL);
      
          if (new_kobj)
              kobject_init(new_kobj, &dynamic_kobj_ktype);
          return new_kobj;
      }
      
      // ktype installed by kobject_create(): release callback plus the generic kobject sysfs_ops
      static struct kobj_type dynamic_kobj_ktype = {
          .release    = dynamic_kobj_release,
          .sysfs_ops  = &kobj_sysfs_ops,
      };
      
      /* Generic sysfs_ops that dispatch to the show/store of a struct kobj_attribute. */
      const struct sysfs_ops kobj_sysfs_ops = {
          .show   = kobj_attr_show,
          .store  = kobj_attr_store,
      };
      EXPORT_SYMBOL_GPL(kobj_sysfs_ops);

      使用dynamic_kobj_ktype的kobject,其sysfs的show和store方法分别为kobj_attr_show和kobj_attr_store:

      /*
       * kobj_attr_show - sysfs_ops ->show for kobj_attribute based files.
       *
       * Recovers the enclosing struct kobj_attribute from @attr and calls
       * its ->show() callback.  Returns -EIO when no callback was provided.
       */
      static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,
                        char *buf)
      {
          struct kobj_attribute *kattr =
              container_of(attr, struct kobj_attribute, attr);
      
          if (!kattr->show)
              return -EIO;
          return kattr->show(kobj, kattr, buf);
      }
      
      /*
       * kobj_attr_store - sysfs_ops ->store for kobj_attribute based files.
       *
       * Recovers the enclosing struct kobj_attribute from @attr and calls
       * its ->store() callback.  Returns -EIO when no callback was provided.
       */
      static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,
                         const char *buf, size_t count)
      {
          struct kobj_attribute *kattr =
              container_of(attr, struct kobj_attribute, attr);
      
          if (!kattr->store)
              return -EIO;
          return kattr->store(kobj, kattr, buf, count);
      }

      真正对属性文件进行读写的回调由业务子系统实现。

      在sysfs中创建符号链接

      /**
       *  sysfs_create_link - create symlink between two objects.
       *  @kobj:  object whose directory we're creating the link in.
       *  @target:    object we're pointing to.
       *  @name:      name of the symlink.
       *
       *  Thin wrapper around sysfs_do_create_link() with warnings enabled
       *  (warn == 1); returns its result.
       */
      int sysfs_create_link(struct kobject *kobj, struct kobject *target,
                    const char *name)
      {
          return sysfs_do_create_link(kobj, target, name, 1);
      }
      EXPORT_SYMBOL_GPL(sysfs_create_link);
      
      /*
       * sysfs_do_create_link - resolve the directory node for a new symlink.
       *
       * The link is created in @kobj's sysfs directory, or in the sysfs root
       * when @kobj is NULL.  Returns -EFAULT when that directory node does not
       * exist, otherwise the result of sysfs_do_create_link_sd().
       */
      static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
                      const char *name, int warn)
      {
          struct kernfs_node *parent = kobj ? kobj->sd : sysfs_root_kn;
      
          if (!parent)
              return -EFAULT;
      
          return sysfs_do_create_link_sd(parent, target, name, warn);
      }
      
      /*
       * sysfs_do_create_link_sd - create a symlink in @parent pointing at
       * @target_kobj's sysfs directory.
       *
       * Returns 0 on success, -ENOENT when the target has no sysfs node, or
       * the error from kernfs_create_link().  A duplicate name is warned
       * about only when @warn is set.
       */
      static int sysfs_do_create_link_sd(struct kernfs_node *parent,
                         struct kobject *target_kobj,
                         const char *name, int warn)
      {
          struct kernfs_node *kn, *target = NULL;
      
          BUG_ON(!name || !parent);
      
          /*
           * We don't own @target_kobj and it may be removed at any time.
           * Synchronize using sysfs_symlink_target_lock.  See
           * sysfs_remove_dir() for details.
           */
          spin_lock(&sysfs_symlink_target_lock);
          if (target_kobj->sd) {
              target = target_kobj->sd;
              kernfs_get(target);
          }
          spin_unlock(&sysfs_symlink_target_lock);
      
          if (!target)
              return -ENOENT;
      
          kn = kernfs_create_link(parent, name, target); // create the sysfs symlink node
          /* drop the temporary reference taken under the lock above */
          kernfs_put(target);
      
          if (!IS_ERR(kn))
              return 0;
      
          if (warn && PTR_ERR(kn) == -EEXIST)
              sysfs_warn_dup(parent, name);
          return PTR_ERR(kn);
      }

      由上面的代码追踪可知,创建符号链接最终由kernfs_create_link函数完成。

      /**
       * kernfs_create_link - create a symlink
       * @parent: directory to create the symlink in
       * @name: name of the symlink
       * @target: target node for the symlink to point to
       *
       * Returns the created node on success, ERR_PTR() value on error.
       */
      struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
                             const char *name,
                             struct kernfs_node *target)
      {
          struct kernfs_node *kn;
          int error;
          // allocate the node; KERNFS_LINK marks it as a symlink
          kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
          if (!kn)
              return ERR_PTR(-ENOMEM);
      
          /* in a namespace-enabled directory the link takes the target's tag */
          if (kernfs_ns_enabled(parent))
              kn->ns = target->ns;
          kn->symlink.target_kn = target;
          kernfs_get(target); /* ref owned by symlink */
      
          error = kernfs_add_one(kn); // link the node into the parent's red-black tree
          if (!error)
              return kn;
      
          kernfs_put(kn);
          return ERR_PTR(error);
      }

      与创建目录和文件类似,最终仍然是调用kernfs_new_node和kernfs_add_one实现。

      基于内核对象的编程套路

      目标:在sysfs中创建一个目录/sys/kernel/storage/,并在该目录下创建一个文件value。可以向value写入整型数据,随后可以读出。
      • 定义内核对象

      /* Example kernel object: an embedded kobject plus the integer it exposes. */
      struct storage_obj {
          struct kobject kobj;  // embedded kobject; container_of() maps it back to the storage_obj
          int val;  // holds the value written by the user
      };
      • 定义属性类型

      struct storage_attribute {
          struct attribute *attr;
          ssize_t (*show)(struct kobject *, struct attribute *, char *);
          ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t);
      }
      • 声明属性 
        声明属性并为其指定show和store方法,如下:

      //定義并初始化storage_attribute
      struct storage_attribute *sattr = &struct storage_attribute {
          .attr = {.name = "value", .mode = 0666},
          .show = storage_show,
          .store = storage_store,
      };
      • 实现sysfs操作

      ssize_t storage_show(struct kobject *kobj, struct attribute *attr, char *buf) 
      {
          struct storage *stor = container_of(kobj, struct storage_obj, kobj);
          stor->val = atoi(buf);
      }
      
      ssize_t storage_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t s) {
          struct storage *stor = container_of(kobj, struct storage_obj, kobj);
          memcpy(buf, s, itoa(stor->val));
      }
      • 定义内核对象的release方法 
        release方法设置在kobj_type结构中

      void storage_release(struct kobject *kobj)
      {
          ......
      }
      • 声明内核对象类型

      struct storage_ktype {
          struct kobj_type *ktype;
      }
      • 封装对象属性的添加和删除方法 
        需要将value属性添加到内核对象,或者从内核对象删除时,可以直接调用sysfs_create_file和sysfs_remove_file。但大多数情况下,会对这两个方法做一层封装:storage_create_file和storage_remove_file。

      /*
       * storage_create_file - add an attribute file to a storage object.
       * @sobj: storage object to add the file to; NULL is accepted
       * @attr: attribute describing the file
       *
       * Returns 0 when @sobj is NULL, otherwise the result of
       * sysfs_create_file().
       */
      int storage_create_file(struct storage_obj *sobj, const struct storage_attribute *attr)
      {
          if (!sobj)
              return 0;
      
          return sysfs_create_file(&sobj->kobj, &attr->attr);
      }
      
      /*
       * storage_remove_file - remove an attribute file from a storage object.
       * @sobj: storage object to remove the file from; NULL is a no-op
       * @attr: attribute describing the file
       */
      void storage_remove_file(struct storage_obj *sobj, const struct storage_attribute *attr)
      {
          if (!sobj)
              return;
      
          sysfs_remove_file(&sobj->kobj, &attr->attr);
      }
      • 定义对象的创建和销毁方法

      struct storage_obj * create_storage_obj() 
      {
          struct storage_obj *sobj = (struct storage_obj *)malloc(struct storage_obj);
          struct storage_ktype *stype = (struct storage_ktype *)malloc(struct storage_ktype);
          sobj->parent = kernel_kobj;
          kobject_init_and_add(&sobj->kobj, &stype->ktype);
      
          return sobj
      }
      
      /*
       * destroy_storage_obj - unregister a storage object and drop it.
       * @kobj: kobject embedded in the storage_obj to destroy
       *
       * Removes the object from sysfs and drops the reference held since
       * creation.  The memory is not freed here: once the last reference is
       * gone the kobject core invokes the ktype's release callback, which is
       * where the storage_obj must be freed.
       */
      void destroy_storage_obj(struct kobject *kobj)
      {
          kobject_del(kobj);
          kobject_put(kobj);
      }
      • 实现模块加载和卸载方法 
        加载时调用create_storage_obj,卸载时调用destroy_storage_obj


      设备驱动模型

      概述

      Linux的设备驱动模型能够带来以下优点: 
      * 使用统一机制来表达设备与驱动之间的关系,规范设备驱动的编写,核心代码复用。 
      * 将系统中的设备以树结构组织,并且通过sysfs将其呈现在用户空间——包括所有的总线和内部连接。 
      * 支持设备的热插拔机制。 
      * 支持通用的电源管理机制,通过由叶子节点到根节点的方向遍历设备树,确保子设备在父设备之前断电。

      内核基于内核对象和sysfs,通过抽象以下五种概念,实现了设备驱动模型的框架,使得编写子系统成为“八股文”。 
      1. bus_type: 总线类型,每个子系统有且只有一个总线类型,由bus_type和subsys_private两个结构共同描述。 
      2. device: 设备,描述挂在总线类型中的设备,由device和device_private两个结构共同描述。 
      3. driver: 驱动,描述挂在总线类型中的驱动模块,由device_driver和driver_private两个结构共同描述。 
      4. class: 类,每个总线类型有且只有一个类,由class和subsys_private两个结构共同描述。 
      5. class_interface: 接口,每个类有多个接口,由class_interface结构描述。

      在Linux内核中,子系统是由bus_type、device、driver、class和class_interface之间的关系所描述的,而设备驱动模型正是这些关系的核心实现,使得在编写子系统程序时,只要遵循设备模型的套路,便不需要关注这些复杂的关系,只需实现自身的业务逻辑。

      每个子系统都有一个总线类型,总线类型拥有一个设备链表和一个驱动链表,用于连接该总线类型已发现的设备和已加载的驱动,设备发现和驱动加载的顺序是任意的。每个设备最多绑定到一个驱动,被绑定了驱动的设备可以正常工作。除此之外,每个设备可以唯一属于某个类,类中包含多个接口,接口的方法作用于设备,不管是先添加接口,还是先发现设备。

      总线类型

      总线类型的数据结构

      /*
       * bus_type - the bus type of a subsystem: its name, attribute groups
       * and the callbacks a subsystem implements.
       */
      struct bus_type {
          const char      *name;         // subsystem name
          const char      *dev_name;     // used by the subsystem to generate device names
          struct device       *dev_root;
          struct device_attribute *dev_attrs; /* use dev_groups instead */
          const struct attribute_group **bus_groups;  // attribute groups of the bus type
          const struct attribute_group **dev_groups;  // attribute groups of devices
          const struct attribute_group **drv_groups;  // attribute groups of drivers
      
          int (*match)(struct device *dev, struct device_driver *drv);    // checks whether a device and a driver can be bound
          int (*uevent)(struct device *dev, struct kobj_uevent_env *env); // sets bus-specific environment variables before an event is sent
          int (*probe)(struct device *dev);     // called when a device can be bound to a driver: initialize the device and bind it
          int (*remove)(struct device *dev);    // called when a device is unbound from its driver
          void (*shutdown)(struct device *dev); // called when the device is powered off
      
          int (*online)(struct device *dev);    // called when the device is brought online
          int (*offline)(struct device *dev);   // called when the device is taken offline
      
          int (*suspend)(struct device *dev, pm_message_t state); // called when the device enters a power-saving state
          int (*resume)(struct device *dev);                      // called when the device returns to its normal state
      
          const struct dev_pm_ops *pm;  // power management operations
      
          const struct iommu_ops *iommu_ops;
      
          struct subsys_private *p;         // private data of the subsystem
          struct lock_class_key lock_key;
      };
      
      /*
       * subsys_private - driver-core private part of a bus type (or class):
       * the ksets and lists that tie the subsystem's devices and drivers
       * together.
       */
      struct subsys_private {
          struct kset subsys;          // kset of the bus; /sys/bus/scsi for the scsi subsystem
          struct kset *devices_kset;   // devices kset; /sys/bus/scsi/devices for the scsi subsystem
          struct list_head interfaces; // list of interfaces of the bus
          struct mutex mutex;          
      
          struct kset *drivers_kset;   // drivers kset; /sys/bus/scsi/drivers for the scsi subsystem
          struct klist klist_devices;  // list of devices on the bus
          struct klist klist_drivers;  // list of drivers on the bus
          struct blocking_notifier_head bus_notifier; // notifier chain informed when the subsystem changes
          unsigned int drivers_autoprobe:1;  // whether to probe automatically when a device or driver is added
          struct bus_type *bus;        // back pointer to the bus type
      
          struct kset glue_dirs;
          struct class *class;         // back pointer to the class
      };

      从上面的两个结构可以看到,bus_type包含的主要是实现子系统时应该具体关注的内容,比如name和一组回调函数。而subsys_private结构主要是设备驱动模型中各种关系的表达,如字段subsys的类型是kset,描述该子系统在sysfs中的表达;klist_devices和klist_drivers分别是设备链表和驱动链表,用于管理总线类型的所有设备和驱动。之后仍然会遇到xxx_private的结构,以这种方式命名的结构,都是给设备驱动模型核心使用的,业务子系统无需也不能使用。

      总线类型注册/反注册

      实现子系统的第一步就是创建bus_type,并将其注册到系统,此时需要调用bus_register:

      /**
       * bus_register - register a driver-core subsystem
       * @bus: bus to register
       *
       * Once we have that, we register the bus with the kobject
       * infrastructure, then register the children subsystems it has:
       * the devices and drivers that belong to the subsystem.
       *
       * Returns 0 on success. On failure the steps already completed are
       * undone in reverse order, bus->p is reset to NULL and a negative
       * errno is returned.
       */
      int bus_register(struct bus_type *bus)
      {
          int retval;
          struct subsys_private *priv;
          struct lock_class_key *key = &bus->lock_key;
          // allocate the private data of the bus type
          priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL);
          if (!priv)
              return -ENOMEM;
      
          priv->bus = bus; // cross-link bus_type and subsys_private
          bus->p = priv;
      
          BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier);
          // store the bus name in the embedded kobject; this is the name shown in sysfs
          retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name);
          if (retval)
              goto out;
      
          priv->subsys.kobj.kset = bus_kset;
          priv->subsys.kobj.ktype = &bus_ktype;
          priv->drivers_autoprobe = 1;    // enable automatic probing
      
          retval = kset_register(&priv->subsys);  // add the bus type to the device model
          if (retval)
              goto out;
      
          retval = bus_create_file(bus, &bus_attr_uevent); // create the uevent attribute file
          if (retval)
              goto bus_uevent_fail;
      
          priv->devices_kset = kset_create_and_add("devices", NULL,  // create the "devices" directory
                               &priv->subsys.kobj);
          if (!priv->devices_kset) {
              retval = -ENOMEM;
              goto bus_devices_fail;
          }
      
          priv->drivers_kset = kset_create_and_add("drivers", NULL,  // create the "drivers" directory
                               &priv->subsys.kobj);
          if (!priv->drivers_kset) {
              retval = -ENOMEM;
              goto bus_drivers_fail;
          }
          // initialise the lists and the mutex
          INIT_LIST_HEAD(&priv->interfaces);
          __mutex_init(&priv->mutex, "subsys mutex", key);
          klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put);
          klist_init(&priv->klist_drivers, NULL, NULL);
      
          retval = add_probe_files(bus); // add the drivers_autoprobe and drivers_probe files in sysfs
          if (retval)
              goto bus_probe_files_fail;
      
          retval = bus_add_groups(bus, bus->bus_groups); // add the attribute files of the bus type
          if (retval)
              goto bus_groups_fail;
      
          pr_debug("bus: '%s': registered\n", bus->name);
          return 0;
          // failure path: each label undoes one step, newest first
      bus_groups_fail:
          remove_probe_files(bus);
      bus_probe_files_fail:
          kset_unregister(bus->p->drivers_kset);
      bus_drivers_fail:
          kset_unregister(bus->p->devices_kset);
      bus_devices_fail:
          bus_remove_file(bus, &bus_attr_uevent);
      bus_uevent_fail:
          kset_unregister(&bus->p->subsys);
      out:
          // NOTE(review): this kfree() is only safe after kset_unregister() if
          // bus_ktype's release callback (not shown here) clears bus->p - confirm.
          kfree(bus->p);
          bus->p = NULL;
          return retval;
      }
      EXPORT_SYMBOL_GPL(bus_register);

      注冊(cè)總線類型后,便可以在系統(tǒng)看到:

      root@ubuntu16:~# ls /sys/bus/scsi -l
      total 0
      drwxr-xr-x 2 root root    0 Sep  5 16:01 devices
      drwxr-xr-x 4 root root    0 Sep  2 09:44 drivers
      -rw-r--r-- 1 root root 4096 Sep  5 11:29 drivers_autoprobe
      --w------- 1 root root 4096 Sep  5 11:29 drivers_probe
      --w------- 1 root root 4096 Sep  2 09:44 uevent
      root@ubuntu16:~#

      當(dāng)從系統(tǒng)中注銷子系統(tǒng)時(shí),需要調(diào)用bus_unregister,完成總線類型的反注冊(cè):

      /**
       * bus_unregister - remove a bus from the system
       * @bus: bus.
       *
       * Unregister the child subsystems and the bus itself, in the reverse
       * order of the steps performed by bus_register().
       * NOTE(review): earlier wording mentioned a final bus_put(); no such
       * call appears in this body.
       */
      void bus_unregister(struct bus_type *bus)
      {
          pr_debug("bus: '%s': unregistering\n", bus->name);
          if (bus->dev_root)
              device_unregister(bus->dev_root);     // remove the root device
          bus_remove_groups(bus, bus->bus_groups);  // remove the attribute files of the bus
          remove_probe_files(bus);                  // remove drivers_autoprobe and drivers_probe
          kset_unregister(bus->p->drivers_kset);    // remove the "drivers" directory
          kset_unregister(bus->p->devices_kset);    // remove the "devices" directory
          bus_remove_file(bus, &bus_attr_uevent);   // remove the uevent file
          kset_unregister(&bus->p->subsys);         // remove the bus directory itself
      }
      EXPORT_SYMBOL_GPL(bus_unregister);

      設(shè)備

      設(shè)備的數(shù)據(jù)結(jié)構(gòu)

      /* Generic device object of the driver model; embeds a kobject and links to its parent, bus, driver and class. */
      struct device {
          struct device       *parent;  // parent device, e.g. an HBA
      
          struct device_private   *p;   // driver-core private data of the device
      
          struct kobject kobj;          // embedded kobject
          const char      *init_name; /* initial name of the device */
          const struct device_type *type;  // device type: fields and methods shared by devices of that type
      
          struct mutex        mutex;  /* mutex to synchronize calls to its driver */
      
          struct bus_type *bus;       /* type of bus device is on; the bus this device belongs to */
          struct device_driver *driver;   /* which driver has allocated this device */
          void        *platform_data; /* Platform specific data, device core doesn't touch it */
          void        *driver_data;   /* Driver data, set and get with dev_set/get_drvdata */
          struct dev_pm_info  power;
          struct dev_pm_domain    *pm_domain;
      
      #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
          struct irq_domain   *msi_domain;
      #endif
      #ifdef CONFIG_PINCTRL
          struct dev_pin_info *pins;
      #endif
      #ifdef CONFIG_GENERIC_MSI_IRQ
          struct list_head    msi_list;
      #endif
      
      #ifdef CONFIG_NUMA
          int     numa_node;  /* NUMA node this device is close to */
      #endif
          u64     *dma_mask;  /* dma mask (if dma'able device) */
          u64     coherent_dma_mask;/* Like dma_mask, but for
                               alloc_coherent mappings as
                               not all hardware supports
                               64 bit addresses for consistent
                               allocations such descriptors. */
          unsigned long   dma_pfn_offset;
      
          struct device_dma_parameters *dma_parms;
      
          struct list_head    dma_pools;  /* dma pools (if dma'ble) */
      
          struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */
      #ifdef CONFIG_DMA_CMA
          struct cma *cma_area;       /* contiguous memory area for dma allocations */
      #endif
          /* arch specific additions */
          struct dev_archdata archdata;
      
          struct device_node  *of_node; /* associated device tree node */
          struct fwnode_handle    *fwnode; /* firmware device node */
      
          dev_t           devt;   /* dev_t, creates the sysfs "dev"; the device number */
          u32         id; /* device instance */
      
          spinlock_t      devres_lock;
          struct list_head    devres_head; // head of the device resource list
      
          struct klist_node   knode_class; // node linking the device into its class's device list
          struct class        *class;      // class the device is linked into
          const struct attribute_group **groups;  /* optional groups; attributes specific to this device */
      
          void    (*release)(struct device *dev);  // device release callback
          struct iommu_group  *iommu_group;
      
          bool            offline_disabled:1;
          bool            offline:1;
      };
      
      /* Driver-core private part of a device: the list linkage that ties it to parent, driver and bus. */
      struct device_private {
          struct klist klist_children;     // list of child devices
          struct klist_node knode_parent;  // node in the parent's klist_children
          struct klist_node knode_driver;  // node in the bound driver's device list
          struct klist_node knode_bus;     // node in the bus's device list
          struct list_head deferred_probe; // entry in the deferred-probe list
          struct device *device;           // back-pointer to the owning device
      };
      
      /* Attributes and callbacks shared by all devices of one type. */
      struct device_type {
          const char *name;  // name of the device type
          const struct attribute_group **groups;  // attribute groups common to devices of this type
          int (*uevent)(struct device *dev, struct kobj_uevent_env *env); // called before an event is sent, to set event environment variables
          char *(*devnode)(struct device *dev, umode_t *mode, // gives a naming hint when the device node is created
                   kuid_t *uid, kgid_t *gid);
          void (*release)(struct device *dev);    // called when the device is released
      
          const struct dev_pm_ops *pm;
      };

      在设备驱动模型中，device结构有bus域，指向device所属的总线类型；class域指向device所属的唯一的类；driver域指向设备所绑定的驱动。与内核对象一样，设备也被组织成层次结构，通过parent域指向父设备。

      device_private結(jié)構(gòu)由設(shè)備驅(qū)動(dòng)模型處理,維護(hù)和其他結(jié)構(gòu)之間的內(nèi)部關(guān)系。device_type結(jié)構(gòu)定義設(shè)備公有的屬性和方法。

      設(shè)備的注冊(cè)與反注冊(cè)

      當(dāng)設(shè)備被發(fā)現(xiàn)后,需要將設(shè)備注冊(cè)到系統(tǒng),需要調(diào)用device_register函數(shù):

      /**
       * device_register - register a device with the system.
       * @dev: pointer to the device structure
       *
       * This happens in two clean steps - initialize the device
       * and add it to the system. The two steps can be called
       * separately, but this is the easiest and most common.
       * I.e. you should only call the two helpers separately if
       * have a clearly defined need to use and refcount the device
       * before it is added to the hierarchy.
       *
       * For more information, see the kerneldoc for device_initialize()
       * and device_add().
       *
       * NOTE: _Never_ directly free @dev after calling this function, even
       * if it returned an error! Always use put_device() to give up the
       * reference initialized in this function instead.
       *
       * Returns whatever device_add() returns.
       */
      int device_register(struct device *dev)
      {
          device_initialize(dev);  // initialise the device structure
          return device_add(dev);  // add the device to the system
      }
      EXPORT_SYMBOL_GPL(device_register);
      
      
      /*
       * device_initialize - prepare a device structure for use.
       * @dev: device to initialise
       *
       * Sets up the embedded kobject, the lists, the locks and the PM state
       * of @dev; it does not add the device to the system.
       */
      void device_initialize(struct device *dev)
      {
          dev->kobj.kset = devices_kset;             // the kset behind /sys/devices/
          kobject_init(&dev->kobj, &device_ktype);   // the device's kobject uses device_ktype
          INIT_LIST_HEAD(&dev->dma_pools);
          mutex_init(&dev->mutex);
          lockdep_set_novalidate_class(&dev->mutex);
          spin_lock_init(&dev->devres_lock);
          INIT_LIST_HEAD(&dev->devres_head);
          device_pm_init(dev);
          set_dev_node(dev, -1);                     // no NUMA node assigned yet
      #ifdef CONFIG_GENERIC_MSI_IRQ
          INIT_LIST_HEAD(&dev->msi_list);
      #endif
      }
      EXPORT_SYMBOL_GPL(device_initialize);

      device_register函數(shù)調(diào)用device_initialize對(duì)device結(jié)構(gòu)進(jìn)行初始化,調(diào)用device_add函數(shù)完成設(shè)備添加到系統(tǒng)。

      /*
       * device_add - add an initialised device to the device hierarchy.
       * @dev: device to add
       *
       * Names the device, attaches it to its parent kobject, creates its
       * sysfs files, adds it to its bus, announces it (bus notifier and
       * KOBJ_ADD uevent), tries to bind a driver, and finally links it into
       * the parent's child list and the class's device list.
       *
       * Returns 0 on success or a negative errno. On failure the labels at
       * the end undo the completed steps in reverse order.
       */
      int device_add(struct device *dev)
      {
          struct device *parent = NULL;
          struct kobject *kobj;
          struct class_interface *class_intf;
          int error = -EINVAL;
      
          dev = get_device(dev);
          if (!dev)
              goto done;
      
          if (!dev->p) {  // no device_private yet: allocate and initialise it here
              error = device_private_init(dev);
              if (error)
                  goto done;
          }
      
          /*
           * for statically allocated devices, which should all be converted
           * some day, we need to initialize the name. We prevent reading back
           * the name, and force the use of dev_name()
           */
          if (dev->init_name) {
              dev_set_name(dev, "%s", dev->init_name); // set the name of the device's kobject
              dev->init_name = NULL;
          }
      
          /* subsystems can specify simple device enumeration */
          if (!dev_name(dev) && dev->bus && dev->bus->dev_name) // no name yet: build one from the bus's dev_name and the device id
              dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id);
      
          if (!dev_name(dev)) {
              error = -EINVAL;
              goto name_error;
          }
      
          pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
      
          parent = get_device(dev->parent);
          kobj = get_device_parent(dev, parent);
          if (kobj)
              dev->kobj.parent = kobj;  // set the parent of the device's kobject
      
          /* use parent numa_node */
          if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
              set_dev_node(dev, dev_to_node(parent));
      
          /* first, register with generic layer. */
          /* we require the name to be set before, and pass NULL */
          error = kobject_add(&dev->kobj, dev->kobj.parent, NULL); // add the device under its parent's directory
          if (error)
              goto Error;
      
          /* notify platform of device entry */
          if (platform_notify)
              platform_notify(dev);
      
          error = device_create_file(dev, &dev_attr_uevent); // create the uevent file in the device directory
          if (error)
              goto attrError;
      
          error = device_add_class_symlinks(dev); // create the class-related symlinks for the device
          if (error)
              goto SymlinkError;
          error = device_add_attrs(dev); // create the files for the device's default attributes
          if (error)
              goto AttrsError;
          error = bus_add_device(dev);  // add the device to its bus_type
          if (error)
              goto BusError;
          error = dpm_sysfs_add(dev);
          if (error)
              goto DPMError;
          device_pm_add(dev);
      
          if (MAJOR(dev->devt)) {
              error = device_create_file(dev, &dev_attr_dev); // create the "dev" attribute file, which holds the device number
              if (error)
                  goto DevAttrError;
      
              error = device_create_sys_dev_entry(dev); // create a symlink to the device directory under /sys/block or /sys/char
              if (error)
                  goto SysEntryError;
      
              devtmpfs_create_node(dev); // create the device file under /dev
          }
      
          /* Notify clients of device addition.  This call must come
           * after dpm_sysfs_add() and before kobject_uevent().
           */
          if (dev->bus)
              blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
                               BUS_NOTIFY_ADD_DEVICE, dev);
      
          kobject_uevent(&dev->kobj, KOBJ_ADD); // send the device ADD event
          bus_probe_device(dev);  // try to bind the device to a device_driver
          if (parent)  // a parent was given: link the device into the parent's child list
              klist_add_tail(&dev->p->knode_parent,
                         &parent->p->klist_children);
      
          if (dev->class) {  // a class was set: link the device into the class's device list
              mutex_lock(&dev->class->p->mutex);
              /* tie the class to the device */
              klist_add_tail(&dev->knode_class,
                         &dev->class->p->klist_devices);
      
              /* notify any interfaces that the device is here */
              list_for_each_entry(class_intf,  // call add_dev of every class_interface of the device's class
                          &dev->class->p->interfaces, node)
                  if (class_intf->add_dev)
                      class_intf->add_dev(dev, class_intf);
              mutex_unlock(&dev->class->p->mutex);
          }
      done:
          put_device(dev);  // drops the reference taken by get_device() at the top
          return error;
          // failure path: each label undoes one step, newest first, then jumps back to done
       SysEntryError:
          if (MAJOR(dev->devt))
              device_remove_file(dev, &dev_attr_dev);
       DevAttrError:
          device_pm_remove(dev);
          dpm_sysfs_remove(dev);
       DPMError:
          bus_remove_device(dev);
       BusError:
          device_remove_attrs(dev);
       AttrsError:
          device_remove_class_symlinks(dev);
       SymlinkError:
          device_remove_file(dev, &dev_attr_uevent);
       attrError:
          kobject_uevent(&dev->kobj, KOBJ_REMOVE);
          kobject_del(&dev->kobj);
       Error:
          cleanup_device_parent(dev);
          put_device(parent);
      name_error:
          kfree(dev->p);
          dev->p = NULL;
          goto done;
      }
      EXPORT_SYMBOL_GPL(device_add);

      設(shè)備添加到系統(tǒng)主要流程都在device_add函數(shù)實(shí)現(xiàn),上面代碼的注釋基本把主要函數(shù)的作用進(jìn)行了描述。值得關(guān)注的一個(gè)函數(shù)便是bus_probe_device,該函數(shù)完成將設(shè)備綁定到驅(qū)動(dòng)的動(dòng)作。

      void bus_probe_device(struct device *dev)
      {
          struct bus_type *bus = dev->bus;
          struct subsys_interface *sif;
      
          if (!bus)
              return;
      
          if (bus->p->drivers_autoprobe) //如果bus允許自動(dòng)探測(cè)
              device_initial_probe(dev); //主要功能
      
          mutex_lock(&bus->p->mutex);
          list_for_each_entry(sif, &bus->p->interfaces, node) //將設(shè)備綁定到接口
              if (sif->add_dev)
                  sif->add_dev(dev, sif);
          mutex_unlock(&bus->p->mutex);
      }
      
      /* First probe of a newly added device: runs __device_attach() with asynchronous probing allowed. */
      void device_initial_probe(struct device *dev)
      {
          __device_attach(dev, true);
      }
      
      /*
       * __device_attach - try to bind a device to a driver.
       * @dev: device to bind
       * @allow_async: whether drivers may be left to an asynchronous probe
       *
       * Runs with the device lock held. If dev->driver is already set, the
       * device is bound to that driver; otherwise every driver on the bus is
       * tried through __device_attach_driver().
       *
       * Returns 1 if the preset driver is (or already was) bound and 0 if
       * binding it failed; with no preset driver, returns the result of
       * bus_for_each_drv().
       */
      static int __device_attach(struct device *dev, bool allow_async)
      {
          int ret = 0;
      
          device_lock(dev);
          if (dev->driver) {  // a driver was specified for this device
              if (klist_node_attached(&dev->p->knode_driver)) { // knode_driver is already linked into a list
                  ret = 1;
                  goto out_unlock;
              }
              ret = device_bind_driver(dev); // bind and update the corresponding lists
              if (ret == 0)
                  ret = 1;
              else {
                  dev->driver = NULL;
                  ret = 0;
              }
          } else {  // no driver was specified for this device
              struct device_attach_data data = {
                  .dev = dev,
                  .check_async = allow_async,
                  .want_async = false,
              };
      
              if (dev->parent)
                  pm_runtime_get_sync(dev->parent);
              // walk all drivers on the bus and try to attach each one
              ret = bus_for_each_drv(dev->bus, NULL, &data,
                          __device_attach_driver);
              if (!ret && allow_async && data.have_async) {
                  /*
                   * If we could not find appropriate driver
                   * synchronously and we are allowed to do
                   * async probes and there are drivers that
                   * want to probe asynchronously, we'll
                   * try them.
                   */
                  dev_dbg(dev, "scheduling asynchronous probe\n");
                  get_device(dev);
                  async_schedule(__device_attach_async_helper, dev);
              } else {
                  pm_request_idle(dev);
              }
      
              if (dev->parent)
                  pm_runtime_put(dev->parent);
          }
      out_unlock:
          device_unlock(dev);
          return ret;
      }

      通過上面3個(gè)函數(shù)的追蹤,__device_attach函數(shù)遍歷bus所有的驅(qū)動(dòng),嘗試執(zhí)行attach,具體調(diào)用__device_attach_driver函數(shù)。

      /*
       * __device_attach_driver - per-driver callback used by __device_attach().
       * @drv: candidate driver
       * @_data: struct device_attach_data describing the device and async policy
       *
       * Returns -EBUSY if the device already has a driver, 0 if this driver
       * does not match or is skipped because of the async policy, otherwise
       * the result of driver_probe_device().
       */
      static int __device_attach_driver(struct device_driver *drv, void *_data)
      {
          struct device_attach_data *data = _data;
          struct device *dev = data->dev;
          bool async_allowed;
      
          /*
           * Check if device has already been claimed. This may
           * happen with driver loading, device discovery/registration,
           * and deferred probe processing happens all at once with
           * multiple threads.
           */
          if (dev->driver) 
              return -EBUSY;
      
          if (!driver_match_device(drv, dev))  // ask the bus's match function whether driver and device fit
              return 0;
          // matched: going on to probe requires the device to be registered
          async_allowed = driver_allows_async_probing(drv);
      
          if (async_allowed)
              data->have_async = true;
          // when async checking is on and this driver's async preference differs from the current pass, skip it for now
          if (data->check_async && async_allowed != data->want_async)
              return 0;
      
          return driver_probe_device(drv, dev);
      }
      
      /*
       * driver_probe_device - attempt to bind one device and one driver.
       * @drv: driver to bind
       * @dev: device to bind
       *
       * Returns -ENODEV if the device is not registered, otherwise the result
       * of really_probe(). The parent, if any, is kept runtime-resumed for
       * the duration of the probe.
       */
      int driver_probe_device(struct device_driver *drv, struct device *dev)
      {
          int ret = 0;
      
          if (!device_is_registered(dev)) // the device must already be registered
              return -ENODEV;
      
          pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
               drv->bus->name, __func__, dev_name(dev), drv->name);
      
          if (dev->parent)
              pm_runtime_get_sync(dev->parent);
      
          pm_runtime_barrier(dev);
          ret = really_probe(dev, drv); // the actual probe
          pm_request_idle(dev);
      
          if (dev->parent)
              pm_runtime_put(dev->parent);
      
          return ret;
      }

      從上面兩個(gè)函數(shù)來看,真正執(zhí)行probe的函數(shù)是really_probe。

      /*
       * really_probe - run the probe callback and bind the device on success.
       * @dev: device being probed
       * @drv: driver to probe it with
       *
       * Returns 1 on success. Returns 0 when any intermediate step fails; in
       * that case everything done so far has been rolled back and the error
       * is only logged, so that the next driver can be tried.
       */
      static int really_probe(struct device *dev, struct device_driver *drv)
      {
          int ret = 0;
          int local_trigger_count = atomic_read(&deferred_trigger_count);
      
          atomic_inc(&probe_count);
          pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
               drv->bus->name, __func__, drv->name, dev_name(dev));
          WARN_ON(!list_empty(&dev->devres_head));
      
          dev->driver = drv; // point the device at the driver being tried
      
          /* If using pinctrl, bind pins now before probing */
          ret = pinctrl_bind_pins(dev);
          if (ret)
              goto probe_failed;
      
          if (driver_sysfs_add(dev)) {  // create the driver->device and device->driver symlinks in sysfs
              printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n",
                  __func__, dev_name(dev));
              goto probe_failed;
          }
      
          if (dev->pm_domain && dev->pm_domain->activate) {
              ret = dev->pm_domain->activate(dev);
              if (ret)
                  goto probe_failed;
          }
      
          /*
           * Ensure devices are listed in devices_kset in correct order
           * It's important to move Dev to the end of devices_kset before
           * calling .probe, because it could be recursive and parent Dev
           * should always go first
           */
          devices_kset_move_last(dev);
      
          if (dev->bus->probe) {
              ret = dev->bus->probe(dev); // the bus_type's probe method takes precedence
              if (ret)
                  goto probe_failed;
          } else if (drv->probe) {
              ret = drv->probe(dev);  // otherwise fall back to the driver's probe method
              if (ret)
                  goto probe_failed;
          }
      
          pinctrl_init_done(dev);
      
          if (dev->pm_domain && dev->pm_domain->sync)
              dev->pm_domain->sync(dev);
      
          driver_bound(dev); // link the device into the driver's device list
          ret = 1;
          pr_debug("bus: '%s': %s: bound device %s to driver %s\n",
               drv->bus->name, __func__, dev_name(dev), drv->name);
          goto done;
      
      probe_failed:  // probe failed: roll back
          devres_release_all(dev);
          driver_sysfs_remove(dev);
          dev->driver = NULL;
          dev_set_drvdata(dev, NULL);
          if (dev->pm_domain && dev->pm_domain->dismiss)
              dev->pm_domain->dismiss(dev);
      
          switch (ret) {
          case -EPROBE_DEFER:
              /* Driver requested deferred probing */
              dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name);
              driver_deferred_probe_add(dev);
              /* Did a trigger occur while probing? Need to re-trigger if yes */
              if (local_trigger_count != atomic_read(&deferred_trigger_count))
                  driver_deferred_probe_trigger();
              break;
          case -ENODEV:
          case -ENXIO:
              pr_debug("%s: probe of %s rejects match %d\n",
                   drv->name, dev_name(dev), ret);
              break;
          default:
              /* driver matched but the probe failed */
              printk(KERN_WARNING
                     "%s: probe of %s failed with error %d\n",
                     drv->name, dev_name(dev), ret);
          }
          /*
           * Ignore errors returned by ->probe so that the next driver can try
           * its luck.
           */
          ret = 0;
      done:
          atomic_dec(&probe_count);
          wake_up(&probe_waitqueue);
          return ret;
      }

      到此,設(shè)備添加到系統(tǒng)的主要流程便基本清楚,不再往下跟蹤。

      驅(qū)動(dòng)

      驅(qū)動(dòng)數(shù)據(jù)結(jié)構(gòu)

      /* Generic driver object of the driver model. */
      struct device_driver {
          const char      *name;     // driver name
          struct bus_type     *bus;  // bus_type the driver belongs to
      
          struct module       *owner;
          const char      *mod_name;  /* used for built-in modules */
      
          bool suppress_bind_attrs;   /* disables bind/unbind via sysfs */
          enum probe_type probe_type;
      
          const struct of_device_id   *of_match_table;
          const struct acpi_device_id *acpi_match_table;
      
          int (*probe) (struct device *dev);  // initialises the device before it is bound to the driver
          int (*remove) (struct device *dev); // called when the device is unbound from the driver
          void (*shutdown) (struct device *dev);
          int (*suspend) (struct device *dev, pm_message_t state);
          int (*resume) (struct device *dev);
          const struct attribute_group **groups; // attributes of the driver
      
          const struct dev_pm_ops *pm; // power management operations
      
          struct driver_private *p;  // driver-core private data of the driver
      };
      
      /* Driver-core private part of a driver; allocated in bus_add_driver() and reachable through device_driver.p. */
      struct driver_private {
          struct kobject kobj;          // kobject placed in the bus's drivers_kset
          struct klist klist_devices;   // list of devices bound to this driver
          struct klist_node knode_bus;  // node in the bus_type's driver list
          struct module_kobject *mkobj;
          struct device_driver *driver;  // back-pointer to the owning driver
      };

      device_driver结构中，bus域指向驱动所属的总线类型；driver_private结构中的knode_bus域用于链入总线类型的驱动链表，klist_devices域用于链接所有绑定到本驱动的设备。

      驅(qū)動(dòng)注冊(cè)與反注冊(cè)

      驅(qū)動(dòng)在加載時(shí),需要將其注冊(cè)到總線類型,調(diào)用driver_register實(shí)現(xiàn):

      int driver_register(struct device_driver *drv)
      {
          int ret;
          struct device_driver *other;
      
          BUG_ON(!drv->bus->p); //確保bus已經(jīng)注冊(cè)到驅(qū)動(dòng)模型中
          //如果bus_type和driver都實(shí)現(xiàn)了同一個(gè)回調(diào),優(yōu)先使用bus_type的回調(diào)函數(shù),打印告警信息
          if ((drv->bus->probe && drv->probe) ||
              (drv->bus->remove && drv->remove) ||
              (drv->bus->shutdown && drv->shutdown))
              printk(KERN_WARNING "Driver '%s' needs updating - please use "
                  "bus_type methods\n", drv->name);
      
          other = driver_find(drv->name, drv->bus); //根據(jù)名字查找驅(qū)動(dòng)
          if (other) {
              printk(KERN_ERR "Error: Driver '%s' is already registered, "
                  "aborting...\n", drv->name);
              return -EBUSY;
          }
      
          ret = bus_add_driver(drv); //將driver添加到bus
          if (ret)
              return ret;
          ret = driver_add_groups(drv, drv->groups); //創(chuàng)建driver屬性文件
          if (ret) {
              bus_remove_driver(drv);
              return ret;
          }
          kobject_uevent(&drv->p->kobj, KOBJ_ADD); //發(fā)送ADD事件到用戶空間
      
          return ret;
      }
      EXPORT_SYMBOL_GPL(driver_register);

      添加driver到bus_type,由bus_add_driver實(shí)現(xiàn):

      int bus_add_driver(struct device_driver *drv)
      {
          struct bus_type *bus;
          struct driver_private *priv;
          int error = 0;
      
          bus = bus_get(drv->bus);
          if (!bus)
              return -EINVAL;
      
          pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name);
      
          priv = kzalloc(sizeof(*priv), GFP_KERNEL);  //分配driver_private結(jié)構(gòu)空間
          if (!priv) {
              error = -ENOMEM;
              goto out_put_bus;
          }
          klist_init(&priv->klist_devices, NULL, NULL); //初始化driver設(shè)備鏈表
          priv->driver = drv; //關(guān)聯(lián)device_driver和driver_private
          drv->p = priv;
          priv->kobj.kset = bus->p->drivers_kset; //driver_private中的kobj的kset域指向subsys中的drivers_kset
          error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL,  //添加driver到sysfs
                           "%s", drv->name);
          if (error)
              goto out_unregister;
      
          klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers); //添加driver到bus的驅(qū)動(dòng)鏈表中
          if (drv->bus->p->drivers_autoprobe) {  //自動(dòng)探測(cè)
              if (driver_allows_async_probing(drv)) {  //允許異步執(zhí)行probe
                  pr_debug("bus: '%s': probing driver %s asynchronously\n",
                      drv->bus->name, drv->name);
                  async_schedule(driver_attach_async, drv); //異步probe
              } else {
                  error = driver_attach(drv);  //同步probe
                  if (error)
                      goto out_unregister;
              }
          }
          module_add_driver(drv->owner, drv);  //驅(qū)動(dòng)實(shí)現(xiàn)的模塊
      
          error = driver_create_file(drv, &driver_attr_uevent);  //在driver中添加uevent屬性文件
          if (error) {
              printk(KERN_ERR "%s: uevent attr (%s) failed\n",
                  __func__, drv->name);
          }
          error = driver_add_groups(drv, bus->drv_groups);  //添加driver的屬性文件
          if (error) {
              /* How the hell do we get out of this pickle? Give up */
              printk(KERN_ERR "%s: driver_create_groups(%s) failed\n",
                  __func__, drv->name);
          }
      
          if (!drv->suppress_bind_attrs) {
              error = add_bind_files(drv);  //在driver目錄添加的bind和unbind兩個(gè)屬性文件
              if (error) {
                  /* Ditto */
                  printk(KERN_ERR "%s: add_bind_files(%s) failed\n",
                      __func__, drv->name);
              }
          }
      
          return 0;
      
      out_unregister:
          kobject_put(&priv->kobj);
          kfree(drv->p);
          drv->p = NULL;
      out_put_bus:
          bus_put(bus);
          return error;
      }

      bus_add_driver函數(shù)完成驅(qū)動(dòng)添加到總線類型,當(dāng)驅(qū)動(dòng)添加完成后,如果總線類型設(shè)置了允許自動(dòng)探測(cè)標(biāo)志drivers_autoprobe,便可以根據(jù)是否允許異步探測(cè)調(diào)用driver_attach_async或driver_attach,driver_attach_async也是調(diào)用driver_attach:

      /*
       * driver_attach - try to bind a driver to the devices on its bus.
       * @drv: driver to bind
       *
       * Calls __driver_attach() for each device via bus_for_each_dev() and
       * returns that function's result.
       */
      int driver_attach(struct device_driver *drv)
      {
          return bus_for_each_dev(drv->bus, NULL, drv, __driver_attach);
      }
      EXPORT_SYMBOL_GPL(driver_attach);
      
      static int __driver_attach(struct device *dev, void *data)
      {
          struct device_driver *drv = data;
      
          /*
           * Lock device and try to bind to it. We drop the error
           * here and always return 0, because we need to keep trying
           * to bind to devices and some drivers will return an error
           * simply if it didn't support the device.
           *
           * driver_probe_device() will spit a warning if there
           * is an error.
           */
      
          if (!driver_match_device(drv, dev)) //調(diào)用bus_type.match
              return 0;
      
          if (dev->parent)    /* Needed for USB */
              device_lock(dev->parent);
          device_lock(dev);
          if (!dev->driver)
              driver_probe_device(drv, dev); //完成probe的主要函數(shù)
          device_unlock(dev);
          if (dev->parent)
              device_unlock(dev->parent);
      
          return 0;
      }
      
      /*
       * driver_probe_device - attempt to bind one device and one driver.
       * @drv: driver to bind
       * @dev: device to bind
       *
       * Returns -ENODEV if the device is not registered, otherwise the result
       * of really_probe(). The parent, if any, is kept runtime-resumed for
       * the duration of the probe.
       */
      int driver_probe_device(struct device_driver *drv, struct device *dev)
      {
          int ret = 0;
      
          if (!device_is_registered(dev)) // the device must already be registered
              return -ENODEV;
      
          pr_debug("bus: '%s': %s: matched device %s with driver %s\n",
               drv->bus->name, __func__, dev_name(dev), drv->name);
      
          if (dev->parent)
              pm_runtime_get_sync(dev->parent);
      
          pm_runtime_barrier(dev);
          ret = really_probe(dev, drv); // the actual probe
          pm_request_idle(dev);
      
          if (dev->parent)
              pm_runtime_put(dev->parent);
      
          return ret;
      }

      根據(jù)上面3個(gè)函數(shù),最終仍然是調(diào)用前面描述過的really_probe函數(shù)完成最后的探測(cè)。

      到這里驅(qū)動(dòng)注冊(cè)完成,結(jié)合之前的設(shè)備注冊(cè)流程,無論是驅(qū)動(dòng)注冊(cè)或是設(shè)備注冊(cè),只要總線類型設(shè)置了自動(dòng)探測(cè)標(biāo)志位,這兩個(gè)流程都會(huì)執(zhí)行探測(cè)。所以設(shè)備發(fā)現(xiàn)與驅(qū)動(dòng)的加載順序已經(jīng)不再重要,也是通過這種雙向探測(cè)方式,Linux內(nèi)核支持設(shè)備的熱拔插機(jī)制。

      驅(qū)動(dòng)卸載時(shí),需要調(diào)用driver_unregister函數(shù),使driver脫離總線類型:

      /*
       * driver_unregister - detach a driver from its bus type.
       * @drv: driver to unregister.
       *
       * A NULL driver, or one whose private data (drv->p) is NULL, only
       * triggers a warning and nothing else is done.
       */
      void driver_unregister(struct device_driver *drv)
      {
          if (drv && drv->p) {
              /* Remove the driver's own attribute files ... */
              driver_remove_groups(drv, drv->groups);
              /* ... then take the driver off its bus type. */
              bus_remove_driver(drv);
              return;
          }

          WARN(1, "Unexpected driver unregister!\n");
      }
      EXPORT_SYMBOL_GPL(driver_unregister);
      
      /*
       * bus_remove_driver - remove a driver from its bus type.
       * @drv: driver to remove.
       *
       * Does nothing when drv->bus is NULL.
       */
      void bus_remove_driver(struct device_driver *drv)
      {
          if (!drv->bus)
              return;
      
          if (!drv->suppress_bind_attrs)
              remove_bind_files(drv);   // remove the bind and unbind files in the driver's directory
          driver_remove_groups(drv, drv->bus->drv_groups); // remove the bus type's driver attribute files
          driver_remove_file(drv, &driver_attr_uevent);    // remove the uevent file in the driver's directory
          klist_remove(&drv->p->knode_bus); // take the driver off the bus type's driver list
          pr_debug("bus: '%s': remove driver %s\n", drv->bus->name, drv->name);
          driver_detach(drv);  // unbind the driver from every device bound to it
          module_remove_driver(drv);  // deals with the module that implements the driver
          kobject_put(&drv->p->kobj); // drop a reference
          bus_put(drv->bus);
      }

      類數(shù)據(jù)結(jié)構(gòu)

      /*
       * struct class - a device class in the driver model.
       * The per-field notes below are the article author's annotations.
       */
      struct class {
          const char      *name;       // class name
          struct module       *owner;  // module that implements this class
      
          struct class_attribute      *class_attrs;     // attributes common to the class
          const struct attribute_group    **dev_groups; // default attributes of devices belonging to this class
          struct kobject          *dev_kobj;            // kobject of the /sys/dev directory for this class's devices; __class_register() defaults it to sysfs_dev_char_kobj
      
          int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env); // sets class-specific environment variables before an event is sent
          char *(*devnode)(struct device *dev, umode_t *mode); // returns the device name when a device is created
      
          void (*class_release)(struct class *class); // called when the class is released
          void (*dev_release)(struct device *dev);    // called when a device is released
      
          int (*suspend)(struct device *dev, pm_message_t state); // called when a device goes to sleep
          int (*resume)(struct device *dev);                      // called when a device is woken up
      
          const struct kobj_ns_type_operations *ns_type;  // namespace support in sysfs
          const void *(*namespace)(struct device *dev);   // returns the namespace the device is in
      
          const struct dev_pm_ops *pm;  // power management
      
          struct subsys_private *p;     // private subsystem data of the class
      };

      類的私有數(shù)據(jù)類型與總線類型的私有數(shù)據(jù)類型都是subsys_private,這里將不再重復(fù)描述。

      類注冊(cè)與反注冊(cè)

      子系統需要使用類時,需要調用class_register函數向設備驅動模型注冊類:

      /*
       * class_register - register a class.
       * Each expansion defines its own static lock_class_key and passes it,
       * together with the class, to __class_register(); the statement
       * expression evaluates to that call's return value.
       */
      #define class_register(class)               \
      ({                                          \
          static struct lock_class_key __key;     \
          __class_register(class, &__key);        \
      })
      
      /*
       * __class_register - allocate a class's private data and register it.
       * @cls: class to register.
       * @key: lock class key passed to __mutex_init() for the private mutex.
       *
       * Returns 0 on success, -ENOMEM when the private data cannot be
       * allocated, or the error from kobject_set_name(), kset_register() or
       * add_class_attrs().
       */
      int __class_register(struct class *cls, struct lock_class_key *key)
      {
          struct subsys_private *cp;
          int error;
      
          pr_debug("device class '%s': registering\n", cls->name);
      
          cp = kzalloc(sizeof(*cp), GFP_KERNEL); // allocate the private data
          if (!cp)
              return -ENOMEM;
          klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put); // initialise this class's device list
          INIT_LIST_HEAD(&cp->interfaces);  // initialise the interface list
          kset_init(&cp->glue_dirs);
          __mutex_init(&cp->mutex, "subsys mutex", key);
          error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name); // this name is what shows up under /sys/class/
          if (error) {
              kfree(cp);
              return error;
          }
      
          /* set the default /sys/dev directory for devices of this class */
          if (!cls->dev_kobj)
              cls->dev_kobj = sysfs_dev_char_kobj;
      
      #if defined(CONFIG_BLOCK)
          /* let the block class directory show up in the root of sysfs */
          if (!sysfs_deprecated || cls != &block_class)
              cp->subsys.kobj.kset = class_kset;
      #else
          cp->subsys.kobj.kset = class_kset;  // the global class_kset refers to /sys/class/
      #endif
          cp->subsys.kobj.ktype = &class_ktype;
          cp->class = cls;  // tie the class and its subsys_private together
          cls->p = cp;
      
          error = kset_register(&cp->subsys);  // create the class's directory under /sys/class/
          if (error) {
              // NOTE(review): cls->p was set to cp just above and is not cleared
              // here, so on this path it is left pointing at freed memory.
              kfree(cp);
              return error;
          }
          // NOTE(review): an add_class_attrs() error is returned as-is, without
          // undoing the kset_register() above.
          error = add_class_attrs(class_get(cls)); // create the class attribute files under /sys/class/xxx/
          class_put(cls);
          return error;
      }
      EXPORT_SYMBOL_GPL(__class_register);

      類的注冊比較簡單,注釋已經比較詳細。當子系統需要卸載類時,需要調用class_unregister函數:

      /*
       * class_unregister - undo the registration of a class.
       * @cls: class to unregister.
       */
      void class_unregister(struct class *cls)
      {
          struct kset *subsys;

          pr_debug("device class '%s': unregistering\n", cls->name);

          /* Remove the class attribute files under /sys/class/xxx/. */
          remove_class_attrs(cls);

          /* Unregister the kset that __class_register() registered for the class. */
          subsys = &cls->p->subsys;
          kset_unregister(subsys);
      }

      接口

      接口數(shù)據(jù)結(jié)構(gòu)

      /*
       * struct class_interface - an interface attached to a class.
       */
      struct class_interface {
          struct list_head    node;    // links the interface into its class
          struct class        *class;  // the class this interface belongs to
          // add_dev/remove_dev add a device to, or remove it from, the interface.
          // They are invoked when the interface itself is registered/unregistered,
          // or when a device is added to the interface's class.
          int (*add_dev)      (struct device *, struct class_interface *);
          void (*remove_dev)  (struct device *, struct class_interface *);
      };

      接口注冊(cè)與反注冊(cè)

      向類中注冊(cè)接口,需要調(diào)用class_interface_register函數(shù)完成:

      int class_interface_register(struct class_interface *class_intf)
      {
          struct class *parent;
          struct class_dev_iter iter;
          struct device *dev;
      
          if (!class_intf || !class_intf->class)  //確保class和class_interface都存在
              return -ENODEV;
      
          parent = class_get(class_intf->class); //增加引用計(jì)數(shù),并返回接口所屬的class
          if (!parent)
              return -EINVAL;
      
          mutex_lock(&parent->p->mutex);
          list_add_tail(&class_intf->node, &parent->p->interfaces); //將class_interface添加到class的接口鏈表
          if (class_intf->add_dev) {  //如果接口設(shè)置了add_dev方法,對(duì)該class的所有device調(diào)用
              class_dev_iter_init(&iter, parent, NULL, NULL);
              while ((dev = class_dev_iter_next(&iter)))
                  class_intf->add_dev(dev, class_intf);  //接口方法作用于設(shè)備
              class_dev_iter_exit(&iter);
          }
          mutex_unlock(&parent->p->mutex);
      
          return 0;
      }

      從類中刪除接口,需要調(diào)用class_interface_unregister函數(shù)完成:

      void class_interface_unregister(struct class_interface *class_intf)
      {
          struct class *parent = class_intf->class;
          struct class_dev_iter iter;
          struct device *dev;
      
          if (!parent)
              return;
      
          mutex_lock(&parent->p->mutex);
          list_del_init(&class_intf->node); //將class_interface從class的接口鏈表中刪除
          if (class_intf->remove_dev) { //如果接口設(shè)置了remove_dev方法,對(duì)該class的所有device調(diào)用
              class_dev_iter_init(&iter, parent, NULL, NULL);
              while ((dev = class_dev_iter_next(&iter)))
                  class_intf->remove_dev(dev, class_intf);  //接口方法作用于設(shè)備
              class_dev_iter_exit(&iter);
          }
          mutex_unlock(&parent->p->mutex);
      
          class_put(parent);
      }

      基于設(shè)備驅(qū)動(dòng)模型實(shí)現(xiàn)子系統(tǒng)

      Linux設備驅動模型已經實現了各種對象之間的關係以及它們在sysfs中的呈現方式。實現子系統只需要定義業務自身的總線類型、設備、驅動、類、接口,分別“繼承”bus_type、device、device_driver、class、class_interface,並根據具體業務實現各個結構規定的回調函數。最後調用上述的注冊函數添加到系統,便完成子系統的開發。


      SCSI子系統(tǒng)之概述

      Linux SCSI子系統(tǒng)的分層架構(gòu):

      這里寫圖片描述

      • 低層:代表與SCSI的物理接口的實(shí)際驅(qū)動(dòng)器,例如各個(gè)廠商為其特定的主機(jī)適配器(Host Bus Adapter, HBA)開發(fā)的驅(qū)動(dòng),低層驅(qū)動(dòng)主要作用是發(fā)現(xiàn)連接到主機(jī)適配器的scsi設(shè)備,在內(nèi)存中構(gòu)建scsi子系統(tǒng)所需的數(shù)據(jù)結(jié)構(gòu),并提供消息傳遞接口,將scsi命令的接受與發(fā)送解釋為主機(jī)適配器的操作。

      • 高層: 代表各種scsi設(shè)備類型的驅(qū)動(dòng),如scsi磁盤驅(qū)動(dòng),scsi磁帶驅(qū)動(dòng),高層驅(qū)動(dòng)認(rèn)領(lǐng)低層驅(qū)動(dòng)發(fā)現(xiàn)的scsi設(shè)備,為這些設(shè)備分配名稱,將對(duì)設(shè)備的IO轉(zhuǎn)換為scsi命令,交由低層驅(qū)動(dòng)處理。

      • 中層:包含scsi棧的公共服務(wù)函數(shù)。高層和低層通過調(diào)用中層的函數(shù)完成其功能,而中層在執(zhí)行過程中,也需要調(diào)用高層和低層注冊(cè)的回調(diào)函數(shù)做一些個(gè)性化處理。

      Linux SCSI模型

      這里寫圖片描述

      Linux SCSI模型是內(nèi)核的抽象,主機(jī)適配器連接主機(jī)IO總線(如PCI總線)和存儲(chǔ)IO總線(如SCSI總線)。一臺(tái)計(jì)算機(jī)可以有多個(gè)主機(jī)適配器,而主機(jī)適配器可以控制一條或多條SCSI總線,一條總線可以有多個(gè)目標(biāo)節(jié)點(diǎn)與之相連,并且一個(gè)目標(biāo)節(jié)點(diǎn)可以有多個(gè)邏輯單元。

      在Linux SCSI子系統(tǒng)中,內(nèi)核中的目標(biāo)節(jié)點(diǎn)(target)對(duì)應(yīng)SCSI磁盤,SCSI磁盤中可以有多個(gè)邏輯單元,統(tǒng)一由磁盤控制器控制,這些邏輯單元才是真正作為IO終點(diǎn)的存儲(chǔ)設(shè)備,內(nèi)核用設(shè)備(device)對(duì)邏輯單元進(jìn)行抽象;內(nèi)核中的Host對(duì)應(yīng)主機(jī)適配器(物理的HBA/RAID卡,虛擬的iscsi target)

      內核使用四元組 <host:channel:id:lun> 來唯一標識一個scsi的邏輯單元,在sysfs中查看sda磁盤<2:0:0:0>顯示如下:

      root@ubuntu16:/home/comet/Costor/bin# ls /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/
      alignment_offset  device             events_poll_msecs  integrity  removable  sda5    subsystem
      bdi               discard_alignment  ext_range          power      ro         size    trace
      capability        events             holders            queue      sda1       slaves  uevent
      dev               events_async       inflight           range      sda2       stat
      root@ubuntu16:/home/comet/Costor/bin# cat /sys/bus/scsi/devices/2\:0\:0\:0/block/sda/dev
      8:0
      root@ubuntu16:/home/comet/Costor/bin# ll /dev/sda
      brw-rw---- 1 root disk 8, 0 Sep 19 11:36 /dev/sda
      • host: 主機(jī)適配器的唯一編號(hào)。

      • channel: 主機(jī)適配器中scsi通道編號(hào),由主機(jī)適配器固件維護(hù)。

      • id: 目標(biāo)節(jié)點(diǎn)唯一標(biāo)識(shí)符。

      • lun: 目標(biāo)節(jié)點(diǎn)內(nèi)邏輯單元編號(hào)。

      SCSI命令

      SCSI 命令是在 Command Descriptor Block (CDB) 中定義的。CDB 包含了用來定義要執(zhí)行的特定操作的操作代碼,以及大量特定于操作的參數(shù)。

      命令 | 用途
      Test unit ready | 查詢設備是否已經準備好進行傳輸
      Inquiry | 請求設備基本信息
      Request sense | 請求之前命令的錯誤信息
      Read capacity | 請求存儲容量信息
      Read | 從設備讀取數據
      Write | 向設備寫入數據
      Mode sense | 請求模式頁面(設備參數)
      Mode select | 在模式頁面配置設備參數

      借助大約 60 種可用命令,SCSI 可適用于許多設(shè)備(包括隨機(jī)存取設(shè)備,比如磁盤和像磁帶這樣的順序存儲(chǔ)設(shè)備)。SCSI 也提供了專門的命令以訪問箱體服務(wù)(比如存儲(chǔ)箱體內(nèi)部當(dāng)前的傳感和溫度)。

      核心數(shù)據(jù)結(jié)構(gòu)

      主機(jī)適配器模板scsi_host_template

      主機適配器模板是相同型號主機適配器的公共內容,包括請求隊列深度,SCSI命令處理回調函數,錯誤處理恢復函數。分配主機適配器結構時,需要使用主機適配器模板來賦值。在編寫SCSI低層驅動時,第一步便是定義模板scsi_host_template,之後才能由模板生成主機適配器。

      struct scsi_host_template {
          struct module *module;  //指向使用該模板實(shí)現(xiàn)的scsi_host,低層驅(qū)動(dòng)模塊。
          const char *name;       //主機(jī)適配器名稱
      
          int (* detect)(struct scsi_host_template *);
          int (* release)(struct Scsi_Host *);
      
          const char *(* info)(struct Scsi_Host *); //返回HBA相關(guān)信息,可選實(shí)現(xiàn)
      
          int (* ioctl)(struct scsi_device *dev, int cmd, void __user *arg); //用戶空間ioctl函數(shù)的實(shí)現(xiàn),可選實(shí)現(xiàn)
      
      
      #ifdef CONFIG_COMPAT
          //通過該函數(shù),支持32位系統(tǒng)的用戶態(tài)ioctl函數(shù)
          int (* compat_ioctl)(struct scsi_device *dev, int cmd, void __user *arg);
      #endif
      
          //將scsi命令放進(jìn)低層驅(qū)動(dòng)的隊(duì)列,由中間層調(diào)用,必須實(shí)現(xiàn)
          int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);
      
          //以下5個(gè)函數(shù)是錯(cuò)誤處理回調(diào)函數(shù),由中間層按照嚴(yán)重程度調(diào)用
          int (* eh_abort_handler)(struct scsi_cmnd *);        //Abort
          int (* eh_device_reset_handler)(struct scsi_cmnd *); //Device Reset
          int (* eh_target_reset_handler)(struct scsi_cmnd *); //Target Reset
          int (* eh_bus_reset_handler)(struct scsi_cmnd *);    //Bus Reset
          int (* eh_host_reset_handler)(struct scsi_cmnd *);   //Host Reset
      
          //當(dāng)掃描到新磁盤時(shí)調(diào)用,中間層回調(diào)這個(gè)函數(shù)中可以分配和初始化低層驅(qū)動(dòng)所需要的結(jié)構(gòu)
          int (* slave_alloc)(struct scsi_device *)
      
      //在設(shè)備受到INQUIRY命令后,執(zhí)行相關(guān)的配置操作
          int (* slave_configure)(struct scsi_device *);
      
          //在scsi設(shè)備銷毀之前調(diào)用,中間層回調(diào)用于釋放slave_alloc分配的私有數(shù)據(jù)
          void (* slave_destroy)(struct scsi_device *);
      
          //當(dāng)發(fā)現(xiàn)新的target,中間層調(diào)用,用戶分配target私有數(shù)據(jù)
          int (* target_alloc)(struct scsi_target *);
      
          //在target被銷毀之前,中間層調(diào)用,低層驅(qū)動(dòng)實(shí)現(xiàn),用于釋放target_alloc分配的數(shù)據(jù)
          void (* target_destroy)(struct scsi_target *);
      
          //需要自定義掃描target邏輯時(shí),中間層循環(huán)檢查返回值,直到該函數(shù)返回1,表示掃描完成
          int (* scan_finished)(struct Scsi_Host *, unsigned long);
      
          //需要自定義掃描target邏輯時(shí),掃描開始前回調(diào)
          void (* scan_start)(struct Scsi_Host *);
      
          //改變主機(jī)適配器的隊(duì)列深度,返回設(shè)置的隊(duì)列深度
          int (* change_queue_depth)(struct scsi_device *, int);
      
          //返回磁盤的BIOS參數(shù),如size, device, list (heads, sectors, cylinders)
          int (* bios_param)(struct scsi_device *, struct block_device *,
                  sector_t, int []);
      
          void (*unlock_native_capacity)(struct scsi_device *);
      
          //在procfs中的讀寫操作回調(diào)
          int (*show_info)(struct seq_file *, struct Scsi_Host *);
          int (*write_info)(struct Scsi_Host *, char *, int);
      
          //中間層發(fā)現(xiàn)scsi命令超時(shí)回調(diào)
          enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);
      
          //通過sysfs屬性reset主機(jī)適配器時(shí),回調(diào)
          int (*host_reset)(struct Scsi_Host *shost, int reset_type);
      #define SCSI_ADAPTER_RESET  1
      #define SCSI_FIRMWARE_RESET 2
      
          const char *proc_name; //在proc文件系統(tǒng)的名稱
      
          struct proc_dir_entry *proc_dir;
      
          int can_queue; //主機(jī)適配器能同時(shí)接受的命令數(shù)
      
          int this_id;
      
          /*
           * This determines the degree to which the host adapter is capable
           * of scatter-gather.
           */  //聚散列表的參數(shù)
          unsigned short sg_tablesize;
          unsigned short sg_prot_tablesize;
      
          /*
           * Set this if the host adapter has limitations beside segment count.
           */ //單個(gè)scsi命令能夠訪問的扇區(qū)最大數(shù)量
          unsigned int max_sectors;
      
          /*
           * DMA scatter gather segment boundary limit. A segment crossing this
           * boundary will be split in two.
           */
          unsigned long dma_boundary; //DMA聚散段邊界值,超過該值將被切割成兩個(gè)
      
      #define SCSI_DEFAULT_MAX_SECTORS    1024
      
          short cmd_per_lun;
      
          /*
           * present contains counter indicating how many boards of this
           * type were found when we did the scan.
           */
          unsigned char present;
      
          /* If use block layer to manage tags, this is tag allocation policy */
          int tag_alloc_policy;
      
          /*
           * Track QUEUE_FULL events and reduce queue depth on demand.
           */
          unsigned track_queue_depth:1;
      
          /*
           * This specifies the mode that a LLD supports.
           */
          unsigned supported_mode:2; //低層驅(qū)動(dòng)支持的模式(initiator或target)
      
          /*
           * True if this host adapter uses unchecked DMA onto an ISA bus.
           */
          unsigned unchecked_isa_dma:1;
      
          unsigned use_clustering:1;
      
          /*
           * True for emulated SCSI host adapters (e.g. ATAPI).
           */
          unsigned emulated:1;
      
          /*
           * True if the low-level driver performs its own reset-settle delays.
           */
          unsigned skip_settle_delay:1;
      
          /* True if the controller does not support WRITE SAME */
          unsigned no_write_same:1;
      
          /*
           * True if asynchronous aborts are not supported
           */
          unsigned no_async_abort:1;
      
          /*
           * Countdown for host blocking with no commands outstanding.
           */
          unsigned int max_host_blocked; //主機(jī)適配器發(fā)送隊(duì)列的低閥值,允許累計(jì)多個(gè)命令同時(shí)派發(fā)
      
      #define SCSI_DEFAULT_HOST_BLOCKED   7
      
          /*
           * Pointer to the sysfs class properties for this host, NULL terminated.
           */
          struct device_attribute **shost_attrs; //主機(jī)適配器類屬性
      
          /*
           * Pointer to the SCSI device properties for this host, NULL terminated.
           */
          struct device_attribute **sdev_attrs;  //主機(jī)適配器設(shè)備屬性
      
          struct list_head legacy_hosts;
      
          u64 vendor_id;
      
          /*
           * Additional per-command data allocated for the driver.
           */  //scsi 命令緩沖池,scsi命令都是預(yù)先分配好的,保存在cmd_pool中
          unsigned int cmd_size;
          struct scsi_host_cmd_pool *cmd_pool;
      
          /* temporary flag to disable blk-mq I/O path */
          bool disable_blk_mq;  //禁用通用塊層多隊(duì)列模式標(biāo)志
      };

      主機(jī)適配器Scsi_Host

      Scsi_Host描述一個SCSI主機適配器,SCSI主機適配器通常是一塊基於PCI總線的擴展卡或是一個SCSI控制器芯片。每個SCSI主機適配器可以存在多個通道,一個通道實際擴展了一條SCSI總線。每個通道可以連接多個SCSI目標節點,具體連接數量與SCSI總線帶載能力有關,或者受具體SCSI協議的限制。真實的主機總線適配器是接入主機IO總線上(通常是PCI總線),在系統啟動時,會掃描掛載在PCI總線上的設備,此時會分配主機總線適配器。
      Scsi_Host結(jié)構(gòu)包含內(nèi)嵌通用設(shè)備,將被鏈入SCSI總線類型(scsi_bus_type)的設(shè)備鏈表。

      /*
       * struct Scsi_Host - one SCSI host adapter.
       * Notes prefixed with // (and the additions inside existing comments) are
       * the article author's annotations.
       */
      struct Scsi_Host {
          struct list_head    __devices; // device list
          struct list_head    __targets; // target list
      
          struct scsi_host_cmd_pool *cmd_pool; // scsi command pool
          spinlock_t      free_list_lock;   // protects free_list
          struct list_head    free_list; /* backup store of cmd structs; list of pre-allocated spare scsi commands */
          struct list_head    starved_list; // starved list
      
          spinlock_t      default_lock;
          spinlock_t      *host_lock;
      
          struct mutex        scan_mutex;/* serialize scanning activity */
      
          struct list_head    eh_cmd_q; // list of scsi commands that failed
          struct task_struct    * ehandler;  /* Error recovery thread. */
          struct completion     * eh_action; /* Wait for specific actions on the
                                host. */
          wait_queue_head_t       host_wait; // wait queue for scsi device recovery
          struct scsi_host_template *hostt;  // host adapter template
          struct scsi_transport_template *transportt; // SCSI transport-layer template
      
          /*
           * Area to keep a shared tag map (if needed, will be
           * NULL if not).
           */
          union {
              struct blk_queue_tag    *bqt;
              struct blk_mq_tag_set   tag_set; // used when SCSI multi-queue is supported
          };
          // number of scsi commands already dispatched to the host adapter (LLD)
          atomic_t host_busy;        /* commands actually active on low-level */
          atomic_t host_blocked;  // counts down from max_host_blocked (see that field below)
      
          unsigned int host_failed;      /* commands that failed.
                                protected by host_lock */
          unsigned int host_eh_scheduled;    /* EH scheduled without command */
      
          unsigned int host_no;  /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. Unique within the system. */
      
          /* next two fields are used to bound the time spent in error handling */
          int eh_deadline;
          unsigned long last_reset; // time of the last reset
      
      
          /*
           * These three parameters can be used to allow for wide scsi,
           * and for host adapters that support multiple busses
           * The last two should be set to 1 more than the actual max id
           * or lun (e.g. 8 for SCSI parallel systems).
           */
          unsigned int max_channel; // highest channel number of the host adapter
          unsigned int max_id;      // highest target id of the host adapter
          u64 max_lun;              // highest lun of the host adapter
      
          unsigned int unique_id;
      
          /*
           * The maximum length of SCSI commands that this host can accept.
           * Probably 12 for most host adapters, but could be 16 for others.
           * or 260 if the driver supports variable length cdbs.
           * For drivers that don't set this field, a value of 12 is
           * assumed.
           */
          unsigned short max_cmd_len;
          // the fields below also exist in scsi_host_template and are set from
          // the template's fields
          int this_id;
          int can_queue;
          short cmd_per_lun;
          short unsigned int sg_tablesize;
          short unsigned int sg_prot_tablesize;
          unsigned int max_sectors;
          unsigned long dma_boundary;
          /*
           * In scsi-mq mode, the number of hardware queues supported by the LLD.
           *
           * Note: it is assumed that each hardware queue has a queue depth of
           * can_queue. In other words, the total queue depth per host
           * is nr_hw_queues * can_queue.
           */
          unsigned nr_hw_queues;
          /*
           * Used to assign serial numbers to the cmds.
           * Protected by the host lock.
           */
          unsigned long cmd_serial_number;  // command serial number
      
          unsigned active_mode:2;           // tells whether this is an initiator or a target
          unsigned unchecked_isa_dma:1;
          unsigned use_clustering:1;
      
          /*
           * Host has requested that no further requests come through for the
           * time being.
           */
          // set when the LLD asks for this host to be blocked; the mid layer then
          // stops dispatching commands to the host adapter's queue
          unsigned host_self_blocked:1;
      
          /*
           * Host uses correct SCSI ordering not PC ordering. The bit is
           * set for the minority of drivers whose authors actually read
           * the spec ;).
           */
          unsigned reverse_ordering:1;
      
          /* Task mgmt function in progress */
          unsigned tmf_in_progress:1;
      
          /* Asynchronous scan in progress */
          unsigned async_scan:1;
      
          /* Don't resume host in EH */
          unsigned eh_noresume:1;
      
          /* The controller does not support WRITE SAME */
          unsigned no_write_same:1;
      
          unsigned use_blk_mq:1;       // whether SCSI multi-queue mode is used
          unsigned use_cmd_list:1;
      
          /* Host responded with short (<36 bytes) INQUIRY result */
          unsigned short_inquiry:1;
      
          /*
           * Optional work queue to be utilized by the transport
           */
          char work_q_name[20];  // work queue used by the scsi transport layer
          struct workqueue_struct *work_q;
      
          /*
           * Task management function work queue
           */
          struct workqueue_struct *tmf_work_q;
      
          /* The transport requires the LUN bits NOT to be stored in CDB[1] */
          unsigned no_scsi2_lun_in_cdb:1;
      
          /*
           * Value host_blocked counts down from
           */
          // article's note: the host adapter is only woken up once this many commands
          // have accumulated in the dispatch queue -- NOTE(review): confirm against
          // the kernel comment above
          unsigned int max_host_blocked;
      
          /* Protection Information */
          unsigned int prot_capabilities;
          unsigned char prot_guard_type;
      
          /*
           * q used for scsi_tgt msgs, async events or any other requests that
           * need to be processed in userspace
           */
          struct request_queue *uspace_req_q;
      
          /* legacy crap */
          unsigned long base;
          unsigned long io_port;   // I/O port number
          unsigned char n_io_port;
          unsigned char dma_channel;
          unsigned int  irq;
      
      
          enum scsi_host_state shost_state; // state
      
          /* ldm bits */
          // shost_gendev: embedded generic device, through which the host is linked
          //               into the device list of the SCSI bus type (scsi_bus_type)
          // shost_dev:    embedded class device, through which the host is linked
          //               into the device list of the SCSI host class (shost_class)
          struct device       shost_gendev, shost_dev;
          /*
           * List of hosts per template.
           *
           * This is only for use by scsi_module.c for legacy templates.
           * For these access to it is synchronized implicitly by
           * module_init/module_exit.
           */
          struct list_head sht_legacy_list;
      
          /*
           * Points to the transport data (if any) which is allocated
           * separately
           */
          void *shost_data; // used by the SCSI transport layer
      
          /*
           * Points to the physical bus device we'd use to do DMA
           * Needed just in case we have virtual hosts.
           */
          struct device *dma_dev;
      
          /*
           * We should ensure that this is aligned, both for better performance
           * and also because some compilers (m68k) don't automatically force
           * alignment to a long boundary.
           */ // host-adapter private data
          unsigned long hostdata[0]  /* Used for storage of host specific stuff */
              __attribute__ ((aligned (sizeof(unsigned long))));
      };

      目標(biāo)節(jié)點(diǎn)scsi_target

      scsi_target結(jié)構(gòu)中有一個(gè)內(nèi)嵌驅(qū)動(dòng)模型設(shè)備,被鏈入SCSI總線類型scsi_bus_type的設(shè)備鏈表。

      /*
       * struct scsi_target - one SCSI target node.
       * Notes prefixed with // (and the additions inside existing comments) are
       * the article author's annotations.
       */
      struct scsi_target {
          struct scsi_device  *starget_sdev_user; // the scsi device currently doing I/O; NULL when there is no I/O
          struct list_head    siblings;  // links into the host adapter's target list
          struct list_head    devices;   // list of devices belonging to this target
          struct device       dev;       // generic device, used to join the device driver model
          struct kref     reap_ref; /* last put renders target invisible; reference count of this structure */
          unsigned int        channel;   // number of the channel this target sits on
          unsigned int        id; /* target id ... replace
                           * scsi_device.id eventually */
          unsigned int        create:1; /* signal that it needs to be added */
          unsigned int        single_lun:1;   /* Indicates we should only
                               * allow I/O to one of the luns
                               * for the device at a time. */
          unsigned int        pdt_1f_for_no_lun:1;    /* PDT = 0x1f
                               * means no lun present. */
          unsigned int        no_report_luns:1;   /* Don't use
                               * REPORT LUNS for scanning. */
          unsigned int        expecting_lun_change:1; /* A device has reported
                               * a 3F/0E UA, other devices on
                               * the same target will also. */
          /* commands actually active on LLD. */
          atomic_t        target_busy;
          atomic_t        target_blocked;           // article's note: number of commands currently blocked -- NOTE(review): confirm
      
          /*
           * LLDs should set this in the slave_alloc host template callout.
           * If set to zero then there is not limit.
           */
          unsigned int        can_queue;             // number of commands handled at the same time
          unsigned int        max_target_blocked;    // article's note: threshold for blocked commands -- NOTE(review): confirm
      #define SCSI_DEFAULT_TARGET_BLOCKED 3
      
          char            scsi_level;                // supported SCSI specification level
          enum scsi_target_state  state;             // target state
          void            *hostdata; /* available to low-level driver */
          unsigned long       starget_data[0]; /* for the transport; used by the SCSI transport layer (mid layer) */
          /* starget_data must be the last element!!!! */
      } __attribute__((aligned(sizeof(unsigned long))));

      邏輯設(shè)備scsi_device

      scsi_device描述scsi邏輯設(shè)備,代表scsi磁盤的邏輯單元lun。scsi_device描述符所代表的設(shè)備可能是另一臺(tái)存儲(chǔ)設(shè)備上的SATA/SAS/SCSI磁盤或SSD。操作系統(tǒng)在掃描到連接在主機(jī)適配器上的邏輯設(shè)備時(shí),創(chuàng)建scsi_device結(jié)構(gòu),用于scsi高層驅(qū)動(dòng)和該設(shè)備通信。

      struct scsi_device {
          struct Scsi_Host *host;  //所歸屬的主機(jī)總線適配器
          struct request_queue *request_queue; //請(qǐng)求隊(duì)列
      
          /* the next two are protected by the host->host_lock */
          struct list_head    siblings;   /* list of all devices on this host */ //鏈入主機(jī)總線適配器設(shè)備鏈表
          struct list_head    same_target_siblings; /* just the devices sharing same target id */ //鏈入target的設(shè)備鏈表
      
          atomic_t device_busy;       /* commands actually active on LLDD */
          atomic_t device_blocked;    /* Device returned QUEUE_FULL. */
      
          spinlock_t list_lock;
          struct list_head cmd_list;  /* queue of in use SCSI Command structures */
          struct list_head starved_entry; //鏈入主機(jī)適配器的"饑餓"鏈表
          struct scsi_cmnd *current_cmnd; /* currently active command */ //當(dāng)前正在執(zhí)行的命令
          unsigned short queue_depth; /* How deep of a queue we want */
          unsigned short max_queue_depth; /* max queue depth */
          unsigned short last_queue_full_depth; /* These two are used by */
          unsigned short last_queue_full_count; /* scsi_track_queue_full() */
          unsigned long last_queue_full_time; /* last queue full time */
          unsigned long queue_ramp_up_period; /* ramp up period in jiffies */
      #define SCSI_DEFAULT_RAMP_UP_PERIOD (120 * HZ)
      
          unsigned long last_queue_ramp_up;   /* last queue ramp up time */
      
          unsigned int id, channel; //scsi_device所屬的target id和所在channel通道號(hào)
          u64 lun;  //該設(shè)備的lun編號(hào)
          unsigned int manufacturer;  /* Manufacturer of device, for using  制造商
                           * vendor-specific cmd's */
          unsigned sector_size;   /* size in bytes 硬件的扇區(qū)大小 */
      
          void *hostdata;     /* available to low-level driver 專有數(shù)據(jù) */
          char type;          //SCSI設(shè)備類型
          char scsi_level;    //所支持SCSI規(guī)范的版本號(hào),由INQUIRY命令獲得
          char inq_periph_qual;   /* PQ from INQUIRY data */
          unsigned char inquiry_len;  /* valid bytes in 'inquiry' */
          unsigned char * inquiry;    /* INQUIRY response data */
          const char * vendor;        /* [back_compat] point into 'inquiry' ... */
          const char * model;     /* ... after scan; point to static string */
          const char * rev;       /* ... "nullnullnullnull" before scan */
      
      #define SCSI_VPD_PG_LEN                255
          int vpd_pg83_len;          //sense命令 0x83
          unsigned char *vpd_pg83;
          int vpd_pg80_len;          //sense命令 0x80
          unsigned char *vpd_pg80;
          unsigned char current_tag;  /* current tag */
          struct scsi_target      *sdev_target;   /* used only for single_lun */
      
          unsigned int    sdev_bflags; /* black/white flags as also found in
                       * scsi_devinfo.[hc]. For now used only to
                       * pass settings from slave_alloc to scsi
                       * core. */
          unsigned int eh_timeout; /* Error handling timeout */
          unsigned removable:1;
          unsigned changed:1; /* Data invalid due to media change */
          unsigned busy:1;    /* Used to prevent races */
          unsigned lockable:1;    /* Able to prevent media removal */
          unsigned locked:1;      /* Media removal disabled */
          unsigned borken:1;  /* Tell the Seagate driver to be
                       * painfully slow on this device */
          unsigned disconnect:1;  /* can disconnect */
          unsigned soft_reset:1;  /* Uses soft reset option */
          unsigned sdtr:1;    /* Device supports SDTR messages 支持同步數(shù)據(jù)傳輸 */
          unsigned wdtr:1;    /* Device supports WDTR messages 支持16位寬數(shù)據(jù)傳輸*/
          unsigned ppr:1;     /* Device supports PPR messages 支持PPR(并行協(xié)議請(qǐng)求)消息*/
          unsigned tagged_supported:1;    /* Supports SCSI-II tagged queuing */
          unsigned simple_tags:1; /* simple queue tag messages are enabled */
          unsigned was_reset:1;   /* There was a bus reset on the bus for
                       * this device */
          unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN
                           * because we did a bus reset. */
          unsigned use_10_for_rw:1; /* first try 10-byte read / write */
          unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */
          unsigned no_report_opcodes:1;   /* no REPORT SUPPORTED OPERATION CODES */
          unsigned no_write_same:1;   /* no WRITE SAME command */
          unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */
          unsigned skip_ms_page_8:1;  /* do not use MODE SENSE page 0x08 */
          unsigned skip_ms_page_3f:1; /* do not use MODE SENSE page 0x3f */
          unsigned skip_vpd_pages:1;  /* do not read VPD pages */
          unsigned try_vpd_pages:1;   /* attempt to read VPD pages */
          unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */
          unsigned no_start_on_add:1; /* do not issue start on add */
          unsigned allow_restart:1; /* issue START_UNIT in error handler */
          unsigned manage_start_stop:1;   /* Let HLD (sd) manage start/stop */
          unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */
          unsigned no_uld_attach:1; /* disable connecting to upper level drivers */
          unsigned select_no_atn:1;
          unsigned fix_capacity:1;    /* READ_CAPACITY is too high by 1 */
          unsigned guess_capacity:1;  /* READ_CAPACITY might be too high by 1 */
          unsigned retry_hwerror:1;   /* Retry HARDWARE_ERROR */
          unsigned last_sector_bug:1; /* do not use multisector accesses on
                             SD_LAST_BUGGY_SECTORS */
          unsigned no_read_disc_info:1;   /* Avoid READ_DISC_INFO cmds */
          unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */
          unsigned try_rc_10_first:1; /* Try READ_CAPACACITY_10 first */
          unsigned is_visible:1;  /* is the device visible in sysfs */
          unsigned wce_default_on:1;  /* Cache is ON by default */
          unsigned no_dif:1;  /* T10 PI (DIF) should be disabled */
          unsigned broken_fua:1;      /* Don't set FUA bit */
          unsigned lun_in_cdb:1;      /* Store LUN bits in CDB[1] */
      
          atomic_t disk_events_disable_depth; /* disable depth for disk events */
      
          DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */
          DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */
          struct list_head event_list;    /* asserted events */
          struct work_struct event_work;
      
          unsigned int max_device_blocked; /* what device_blocked counts down from  */
      #define SCSI_DEFAULT_DEVICE_BLOCKED 3
      
          atomic_t iorequest_cnt;
          atomic_t iodone_cnt;
          atomic_t ioerr_cnt;
      
          struct device       sdev_gendev, //內(nèi)嵌通用設(shè)備, 鏈入scsi總線類型(scsi_bus_type)的設(shè)備鏈表
                      sdev_dev; //內(nèi)嵌類設(shè)備,鏈入scsi設(shè)備類(sdev_class)的設(shè)備鏈表
      
          struct execute_work ew; /* used to get process context on put */
          struct work_struct  requeue_work;
      
          struct scsi_device_handler *handler; //自定義設(shè)備處理函數(shù)
          void            *handler_data;
      
          enum scsi_device_state sdev_state;  //scsi設(shè)備狀態(tài)
          unsigned long       sdev_data[0];   //scsi傳輸層使用
      } __attribute__((aligned(sizeof(unsigned long))));

      內(nèi)核定義的SCSI命令結(jié)構(gòu)scsi_cmnd

      scsi_cmnd结构由SCSI中间层创建,传递到SCSI低层驱动。每个IO请求会被创建一个scsi_cmnd,但scsi_cmnd并不一定是IO请求。scsi_cmnd最终转化成一个具体的SCSI命令。除了命令描述块之外,scsi_cmnd包含更丰富的信息,包括数据缓冲区、感测数据缓冲区、完成回调函数以及所关联的块设备驱动层请求等,是SCSI中间层执行SCSI命令的上下文。

      /*
       * Descriptor for one SCSI command: the target device, the command
       * bytes, data/protection/sense buffers, retry accounting, the
       * associated block-layer request and the completion callback.
       */
      struct scsi_cmnd {
          struct scsi_device *device;  /* SCSI device this command belongs to */
          struct list_head list;  /* scsi_cmnd participates in queue lists; linked into the SCSI device's command list */
          struct list_head eh_entry; /* entry for the host eh_cmd_q */
          struct delayed_work abort_work;
          int eh_eflags;      /* Used by error handler */
      
          /*
           * A SCSI Command is assigned a nonzero serial_number before passed
           * to the driver's queue command function.  The serial_number is
           * cleared when scsi_done is entered indicating that the command
           * has been completed.  It is a bug for LLDDs to use this number
           * for purposes other than printk (and even that is only useful
           * for debugging).
           */
          unsigned long serial_number; /* unique serial number of the command */
      
          /*
           * This is set to jiffies as it was when the command was first
           * allocated.  It is used to time how long the command has
           * been outstanding
           */
          unsigned long jiffies_at_alloc; /* jiffies at allocation; used to compute command processing time */
      
          int retries;  /* retry count for this command */
          int allowed;  /* number of retries allowed */
      
          unsigned char prot_op;    /* protection operation (DIF and DIX) */
          unsigned char prot_type;  /* DIF protection type */
          unsigned char prot_flags;
      
          unsigned short cmd_len;   /* command length */
          enum dma_data_direction sc_data_direction;  /* data transfer direction of the command */
      
          /* These elements define the operation we are about to perform */
          unsigned char *cmnd;  /* command bytes in the format defined by the SCSI specification */
      
      
          /* These elements define the operation we ultimately want to perform */
          struct scsi_data_buffer sdb;        /* data buffer of the command */
          struct scsi_data_buffer *prot_sdb;  /* protection information buffer of the command */
      
          unsigned underflow; /* Return error if less than
                         this amount is transferred */
      
          unsigned transfersize;  /* Transfer unit: how much we are
                         guaranteed to transfer with each
                         SCSI transfer (ie, between
                         disconnect / reconnects).
                         Probably == sector size */
      
          struct request *request;    /* The command we are working on:
                             the generic block layer request
                             descriptor */
      
      #define SCSI_SENSE_BUFFERSIZE   96
          unsigned char *sense_buffer;    /* sense data buffer of the command */
                      /* obtained by REQUEST SENSE when
                       * CHECK CONDITION is received on original
                       * command (auto-sense) */
      
          /* Low-level done function - can be used by low-level driver to point
           *        to completion function.  Not used by mid/upper level code. */
          void (*scsi_done) (struct scsi_cmnd *); /* called back when the low-level driver completes the command */
      
          /*
           * The following fields can be written to by the host specific code.
           * Everything else should be left alone.
           */
          struct scsi_pointer SCp;    /* Scratchpad used by some host adapters */
      
          unsigned char *host_scribble;   /* The host adapter is allowed to
                           * call scsi_malloc and get some memory
                           * and hang it here.  The host adapter
                           * is also expected to call scsi_free
                           * to release this memory.  (The memory
                           * obtained by scsi_malloc is guaranteed
                           * to be at an address < 16Mb). */
      
          int result;     /* Status code from lower level driver */
          int flags;      /* Command flags */
      
          unsigned char tag;  /* SCSI-II queued command tag */
      };

      驅(qū)動(dòng)scsi_driver

      /*
       * A SCSI driver: an embedded generic device_driver plus a set of
       * SCSI-specific callbacks.
       */
      struct scsi_driver {
          struct device_driver    gendrv;  /* embedded generic driver ("inherits" device_driver) */
      
          void (*rescan)(struct device *); /* callback invoked before a rescan */
          int (*init_command)(struct scsi_cmnd *);
          void (*uninit_command)(struct scsi_cmnd *);
          int (*done)(struct scsi_cmnd *);  /* called when the low-level driver completes a SCSI command; used to compute the number of bytes completed */
          int (*eh_action)(struct scsi_cmnd *, int); /* error-handling callback */
      };

      設(shè)備模型

      • scsi_bus_type: scsi子系統(tǒng)總線類型

      /* Bus type of the SCSI subsystem; exported for GPL modules. */
      struct bus_type scsi_bus_type = {
              .name       = "scsi",   /* corresponds to /sys/bus/scsi */
              .match      = scsi_bus_match,
          .uevent     = scsi_bus_uevent,
      #ifdef CONFIG_PM
          .pm     = &scsi_bus_pm_ops,
      #endif
      };
      EXPORT_SYMBOL_GPL(scsi_bus_type);
      • shost_class: scsi子系統(tǒng)類

      /* Device class that SCSI host adapters belong to. */
      static struct class shost_class = {
          .name       = "scsi_host",  /* corresponds to /sys/class/scsi_host */
          .dev_release    = scsi_host_cls_release,
      };

      這里寫圖片描述

      初始化過程

      操作系統(tǒng)啟動(dòng)時(shí),會(huì)加載scsi子系統(tǒng),入口函數(shù)是init_scsi,使用subsys_initcall定義:

      /*
       * Entry point of the SCSI subsystem.  Runs the initialisation steps in
       * order; if one fails, the steps that already succeeded are undone in
       * reverse order through the cleanup_* labels and the error is returned.
       * Returns 0 on success.
       */
      static int __init init_scsi(void)
      {
          int error;
      
          error = scsi_init_queue();  /* set up the memory pools needed for scatter-gather lists */
          if (error)
              return error;
          error = scsi_init_procfs(); /* set up the SCSI-related entries in procfs */
          if (error)
              goto cleanup_queue;
          error = scsi_init_devinfo();/* set up the dynamic SCSI device info list */
          if (error)
              goto cleanup_procfs;
          error = scsi_init_hosts();  /* register shost_class, creating the scsi_host directory under /sys/class/ */
          if (error)
              goto cleanup_devlist;
          error = scsi_init_sysctl(); /* register the SCSI sysctl table */
          if (error)
              goto cleanup_hosts;
          error = scsi_sysfs_register(); /* register the scsi_bus_type bus type and the sdev_class class */
          if (error)
              goto cleanup_sysctl;
      
          scsi_netlink_init(); /* initialise the SCSI transport netlink interface */
      
          printk(KERN_NOTICE "SCSI subsystem initialized\n");
          return 0;
      
      /* Unwind ladder: each label undoes one earlier step, then falls through. */
      cleanup_sysctl:
          scsi_exit_sysctl();
      cleanup_hosts:
          scsi_exit_hosts();
      cleanup_devlist:
          scsi_exit_devinfo();
      cleanup_procfs:
          scsi_exit_procfs();
      cleanup_queue:
          scsi_exit_queue();
          printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n",
                 -error);
          return error;
      }

      scsi_init_hosts函數(shù)初始化scsi子系統(tǒng)主機(jī)適配器所屬的類shost_class:

      /*
       * Register the host adapter class (shost_class) and hand the
       * registration result back to the caller.
       */
      int scsi_init_hosts(void)
      {
          int rc = class_register(&shost_class);

          return rc;
      }

      scsi_sysfs_register函數(shù)初始化scsi子系統(tǒng)總線類型scsi_bus_type和設(shè)備所屬的類sdev_class類:

      /*
       * Register the SCSI bus type and then the SCSI device class.  If the
       * class cannot be registered, the bus registration is rolled back.
       * Returns 0 on success, otherwise the error of the failing step.
       */
      int scsi_sysfs_register(void)
      {
          int rc;

          rc = bus_register(&scsi_bus_type);
          if (rc)
              return rc;

          rc = class_register(&sdev_class);
          if (rc)
              bus_unregister(&scsi_bus_type);

          return rc;
      }

      scsi低层驱动是面向主机适配器的,低层驱动被加载时,需要添加主机适配器。主机适配器添加有两种方式:1.在PCI子系统扫描挂载驱动时添加;2.手动方式添加。所有基于硬件PCI接口的主机适配器都采用第一种方式。添加主机适配器包括两个步骤: 
      1. 分配主机适配器数据结构scsi_host_alloc 
      2. 将主机适配器添加到系统scsi_add_host

      /*
       * Allocate and initialise a Scsi_Host from the host template @sht, with
       * @privsize extra bytes allocated right after the structure.
       *
       * Fills in default limits, copies settings from the template, initialises
       * the embedded generic and class devices, starts the error handler thread
       * and allocates the task management workqueue.
       *
       * Returns the new host, or NULL if the allocation, the error handler
       * thread or the workqueue could not be created.
       */
      struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
      {
          struct Scsi_Host *shost;
          gfp_t gfp_mask = GFP_KERNEL;
      
          if (sht->unchecked_isa_dma && privsize)
              gfp_mask |= __GFP_DMA;
          /* allocate the Scsi_Host and its private data area in one go */
          shost = kzalloc(sizeof(struct Scsi_Host) + privsize, gfp_mask);
          if (!shost)
              return NULL;
      
          shost->host_lock = &shost->default_lock;
          spin_lock_init(shost->host_lock);
          shost->shost_state = SHOST_CREATED; /* update the host state */
          INIT_LIST_HEAD(&shost->__devices);  /* initialise the SCSI device list */
          INIT_LIST_HEAD(&shost->__targets);  /* initialise the target list */
          INIT_LIST_HEAD(&shost->eh_cmd_q);   /* initialise the list of failed SCSI commands */
          INIT_LIST_HEAD(&shost->starved_list);   /* initialise the starved list */
          init_waitqueue_head(&shost->host_wait);
          mutex_init(&shost->scan_mutex);
      
          /*
           * subtract one because we increment first then return, but we need to
           * know what the next host number was before increment
           *
           * Host numbers are handed out incrementally.
           */
          shost->host_no = atomic_inc_return(&scsi_host_next_hn) - 1;
          shost->dma_channel = 0xff;
      
          /* These three are default values which can be overridden */
          shost->max_channel = 0; /* default highest channel number is 0 */
          shost->max_id = 8;      /* default maximum number of targets */
          shost->max_lun = 8;     /* default maximum number of scsi_devices (LUNs) */
      
          /* Give each shost a default transportt */
          shost->transportt = &blank_transport_template;  /* SCSI transport layer template */
      
          /*
           * All drivers right now should be able to handle 12 byte
           * commands.  Every so often there are requests for 16 byte
           * commands, but individual low-level drivers need to certify that
           * they actually do something sensible with such commands.
           */
          shost->max_cmd_len = 12;  /* longest SCSI command length */
          shost->hostt = sht;       /* remember the host template in use */
          shost->this_id = sht->this_id;
          shost->can_queue = sht->can_queue;
          shost->sg_tablesize = sht->sg_tablesize;
          shost->sg_prot_tablesize = sht->sg_prot_tablesize;
          shost->cmd_per_lun = sht->cmd_per_lun;
          shost->unchecked_isa_dma = sht->unchecked_isa_dma;
          shost->use_clustering = sht->use_clustering;
          shost->no_write_same = sht->no_write_same;
      
          /*
           * eh_deadline is stored as shost_eh_deadline * HZ, is -1 when
           * shost_eh_deadline is -1 or the template has no host reset
           * handler, and is capped at INT_MAX.
           */
          if (shost_eh_deadline == -1 || !sht->eh_host_reset_handler)
              shost->eh_deadline = -1;
          else if ((ulong) shost_eh_deadline * HZ > INT_MAX) {
              shost_printk(KERN_WARNING, shost,
                       "eh_deadline %u too large, setting to %u\n",
                       shost_eh_deadline, INT_MAX / HZ);
              shost->eh_deadline = INT_MAX;
          } else
              shost->eh_deadline = shost_eh_deadline * HZ;
      
          if (sht->supported_mode == MODE_UNKNOWN) /* the template specifies the HBA mode */
              /* means we didn't set it ... default to INITIATOR */
              shost->active_mode = MODE_INITIATOR;  /* host adapter mode defaults to initiator */
          else
              shost->active_mode = sht->supported_mode;
      
          if (sht->max_host_blocked)
              shost->max_host_blocked = sht->max_host_blocked;
          else
              shost->max_host_blocked = SCSI_DEFAULT_HOST_BLOCKED;
      
          /*
           * If the driver imposes no hard sector transfer limit, start at
           * machine infinity initially.
           */
          if (sht->max_sectors)
              shost->max_sectors = sht->max_sectors;
          else
              shost->max_sectors = SCSI_DEFAULT_MAX_SECTORS;
      
          /*
           * assume a 4GB boundary, if not set
           */
          if (sht->dma_boundary)
              shost->dma_boundary = sht->dma_boundary;
          else
              shost->dma_boundary = 0xffffffff;  /* default DMA boundary is 4GB */
      
          shost->use_blk_mq = scsi_use_blk_mq && !shost->hostt->disable_blk_mq;
      
          device_initialize(&shost->shost_gendev); /* initialise the host's embedded generic device */
          dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
          shost->shost_gendev.bus = &scsi_bus_type;   /* bus type of the host adapter */
          shost->shost_gendev.type = &scsi_host_type; /* device type of the host adapter */
      
          device_initialize(&shost->shost_dev);    /* initialise the host's embedded class device */
          shost->shost_dev.parent = &shost->shost_gendev; /* the class device's parent is the embedded generic device */
          shost->shost_dev.class = &shost_class;   /* the class device belongs to shost_class */
          dev_set_name(&shost->shost_dev, "host%d", shost->host_no);
          shost->shost_dev.groups = scsi_sysfs_shost_attr_groups;  /* attribute groups of the class device */
      
          shost->ehandler = kthread_run(scsi_error_handler, shost,  /* start the host's error recovery kernel thread */
                  "scsi_eh_%d", shost->host_no);
          if (IS_ERR(shost->ehandler)) {
              shost_printk(KERN_WARNING, shost,
                  "error handler thread failed to spawn, error = %ld\n",
                  PTR_ERR(shost->ehandler));
              goto fail_kfree;
          }
          /* allocate the task management workqueue */
          shost->tmf_work_q = alloc_workqueue("scsi_tmf_%d",
                              WQ_UNBOUND | WQ_MEM_RECLAIM,
                             1, shost->host_no);
          if (!shost->tmf_work_q) {
              shost_printk(KERN_WARNING, shost,
                       "failed to create tmf workq\n");
              goto fail_kthread;
          }
          scsi_proc_hostdir_add(shost->hostt); /* add the host's procfs directory, e.g. /proc/scsi/<host adapter name> */
          return shost;
      
       fail_kthread:
          kthread_stop(shost->ehandler);
       fail_kfree:
          kfree(shost);
          return NULL;
      }
      EXPORT_SYMBOL(scsi_host_alloc);
      /*
       * Add @host to the system with @dev as its parent device.  Thin wrapper
       * around scsi_add_host_with_dma() that passes @dev as the DMA device too.
       */
      static inline int __must_check scsi_add_host(struct Scsi_Host *host,
                               struct device *dev) /* dev is the parent device */
      {
          return scsi_add_host_with_dma(host, dev, dev);
      }
      
      /*
       * Add an allocated host adapter to the system.
       *
       * @shost:   host to add
       * @dev:     parent device; &platform_bus is used when it is NULL and the
       *           host has no parent yet
       * @dma_dev: DMA device; the host's parent is used when it is NULL
       *
       * Sets up tags and the command freelist, adds the embedded generic and
       * class devices, allocates optional transport data and workqueue, then
       * registers the host in sysfs and procfs.  On failure the out_* labels
       * run cleanup for the earlier steps in reverse order.
       *
       * Returns 0 on success or a negative errno.
       */
      int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
                     struct device *dma_dev)
      {
          struct scsi_host_template *sht = shost->hostt;
          int error = -EINVAL;
      
          shost_printk(KERN_INFO, shost, "%s\n",
                  sht->info ? sht->info(shost) : sht->name);
      
          if (!shost->can_queue) {
              shost_printk(KERN_ERR, shost,
                       "can_queue = 0 no longer supported\n");
              goto fail;
          }
      
          if (shost_use_blk_mq(shost)) {         /* if the host is set to use multi-queue IO, */
              error = scsi_mq_setup_tags(shost); /* set up the matching multi-queue environment */
              if (error)
                  goto fail;
          } else {
              shost->bqt = blk_init_tags(shost->can_queue,
                      shost->hostt->tag_alloc_policy);
              if (!shost->bqt) {
                  error = -ENOMEM;
                  goto fail;
              }
          }
      
          /*
           * Note that we allocate the freelist even for the MQ case for now,
           * as we need a command set aside for scsi_reset_provider.  Having
           * the full host freelist and one command available for that is a
           * little heavy-handed, but avoids introducing a special allocator
           * just for this.  Eventually the structure of scsi_reset_provider
           * will need a major overhaul.
           *
           * This allocates the buffers that hold SCSI commands and sense
           * data, plus the reserve list of spare SCSI commands.
           */
          error = scsi_setup_command_freelist(shost);
          if (error)
              goto out_destroy_tags;
      
          /*
           * Set the host's parent device, which determines where it appears
           * in sysfs; usually a pci_dev is passed in through dev.
           */
          if (!shost->shost_gendev.parent)
              shost->shost_gendev.parent = dev ? dev : &platform_bus; /* fall back to platform_bus when dev is NULL */
          if (!dma_dev)
              dma_dev = shost->shost_gendev.parent;
      
          shost->dma_dev = dma_dev;
      
          error = device_add(&shost->shost_gendev);  /* add the host's generic device to the system */
          if (error)
              goto out_destroy_freelist;
      
          pm_runtime_set_active(&shost->shost_gendev);
          pm_runtime_enable(&shost->shost_gendev);
          device_enable_async_suspend(&shost->shost_gendev); /* allow asynchronous suspend of the generic device */
      
          scsi_host_set_state(shost, SHOST_RUNNING);  /* set the host adapter state */
          get_device(shost->shost_gendev.parent);     /* take a reference on the generic device's parent */
      
          device_enable_async_suspend(&shost->shost_dev);  /* allow asynchronous suspend of the class device */
      
          error = device_add(&shost->shost_dev);    /* add the host's class device to the system */
          if (error)
              goto out_del_gendev;
      
          get_device(&shost->shost_gendev);
      
          if (shost->transportt->host_size) {  /* data area used by the SCSI transport layer */
              shost->shost_data = kzalloc(shost->transportt->host_size,
                           GFP_KERNEL);
              if (shost->shost_data == NULL) {
                  error = -ENOMEM;
                  goto out_del_dev;
              }
          }
      
          if (shost->transportt->create_work_queue) {
              snprintf(shost->work_q_name, sizeof(shost->work_q_name),
                   "scsi_wq_%d", shost->host_no);
              shost->work_q = create_singlethread_workqueue( /* workqueue used by the SCSI transport layer */
                          shost->work_q_name);
              if (!shost->work_q) {
                  error = -EINVAL;
                  goto out_free_shost_data;
              }
          }
      
          error = scsi_sysfs_add_host(shost); /* add the host adapter to the subsystem */
          if (error)
              goto out_destroy_host;
      
          scsi_proc_host_add(shost);  /* add the host adapter's information to procfs */
          return error;
      
       out_destroy_host:
          if (shost->work_q)
              destroy_workqueue(shost->work_q);
       out_free_shost_data:
          kfree(shost->shost_data);
       out_del_dev:
          device_del(&shost->shost_dev);
       out_del_gendev:
          device_del(&shost->shost_gendev);
       out_destroy_freelist:
          scsi_destroy_command_freelist(shost);
       out_destroy_tags:
          if (shost_use_blk_mq(shost))
              scsi_mq_destroy_tags(shost);
       fail:
          return error;
      }
      EXPORT_SYMBOL(scsi_add_host_with_dma);

      設(shè)備探測(cè)過程

      在系统启动过程中,会扫描默认的PCI根总线,从而触发了PCI设备扫描的过程,开始构造PCI设备树,SCSI主机适配器是挂载在PCI总线的设备。SCSI主机适配器作为PCI设备会被PCI总线驱动层扫描到(PCI设备的扫描采用配置空间访问的方式),扫描到SCSI主机适配器后,操作系统开始加载SCSI主机适配器驱动,SCSI主机适配器驱动就是上面所说的低层驱动。SCSI主机适配器驱动根据SCSI主机适配器模板分配SCSI主机适配器描述符,并添加到系统,之后启动通过SCSI主机适配器扩展出来的下一级总线——SCSI总线的扫描过程。

      SCSI中間層依次以可能的ID和LUN構(gòu)造INQUIRY命令,之后將這些INQUIRY命令提交給塊IO子系統(tǒng),后者又最終將調(diào)用SCSI中間層的策略例程,再次提取到SCSI命令結(jié)構(gòu)后,調(diào)用SCSI低層驅(qū)動(dòng)的queuecommand回調(diào)函數(shù)實(shí)現(xiàn)。 
      對(duì)于給定ID的目標(biāo)節(jié)點(diǎn),如果它在SCSI總線上存在,那么它一定要實(shí)現(xiàn)對(duì)LUN0的INQUIRY響應(yīng)。也就是說,如果向某個(gè)ID的目標(biāo)節(jié)點(diǎn)的LUN0發(fā)送INQUIRY命令,或依次向各個(gè)LUN嘗試發(fā)送INQUIRY命令,檢查是否能收到響應(yīng),最終SCSI中間層能夠得到SCSI域中的所連接的邏輯設(shè)備及其信息。

      SCSI總線具體的掃描方式可以由具體的主機(jī)適配器固件、主機(jī)適配器驅(qū)動(dòng)實(shí)現(xiàn),在此只討論由主機(jī)適配器驅(qū)動(dòng)調(diào)用scsi中間層提供通用的掃描函數(shù)的實(shí)現(xiàn)方式scsi_scan_host。

      /*
       * Scan a host adapter for devices.
       *
       * Nothing happens when the scan type is "none" or when the runtime-PM
       * reference on the host cannot be taken.  If an asynchronous scan can
       * be prepared it is scheduled; otherwise the scan runs synchronously
       * here and the runtime-PM reference is dropped afterwards.
       */
      void scsi_scan_host(struct Scsi_Host *shost)
      {
          struct async_scan_data *scan_ctx;

          /* scanning disabled, or host could not be resumed */
          if (!strncmp(scsi_scan_type, "none", 4) ||
              scsi_autopm_get_host(shost) < 0)
              return;

          scan_ctx = scsi_prep_async_scan(shost);
          if (scan_ctx) {
              /*
               * Register with the async subsystem so that
               * wait_for_device_probe() will flush this work.
               * scsi_autopm_put_host(shost) is called in
               * scsi_finish_async_scan().
               */
              async_schedule(do_scan_async, scan_ctx);
              return;
          }

          /* no async context available: scan synchronously */
          do_scsi_scan_host(shost);
          scsi_autopm_put_host(shost);
      }
      EXPORT_SYMBOL(scsi_scan_host);

      scsi_scan_host函数是scsi中间层提供的主机适配器扫描函数。对于有自定义扫描逻辑需求的主机适配器驱动,可以设置主机适配器模板的回调函数,由scsi_scan_host函数来调用回调实现自定义扫描。 
      scsi_scan_type变量指定了扫描方式:async、sync、none。无论最终扫描方式是同步还是异步,都是由do_scsi_scan_host函数实现:

      /*
       * Perform the actual host scan.
       *
       * If the host template has no scan_finished callback, the generic scan
       * is used with wildcards for channel, id and lun.  Otherwise the
       * optional scan_start callback is invoked and scan_finished is polled
       * every 10 ms, with the elapsed jiffies, until it returns non-zero.
       */
      static void do_scsi_scan_host(struct Scsi_Host *shost)
      {
          unsigned long began;

          if (!shost->hostt->scan_finished) {
              /* generic scan of every channel, target and lun */
              scsi_scan_host_selected(shost, SCAN_WILD_CARD, SCAN_WILD_CARD,
                      SCAN_WILD_CARD, 0);
              return;
          }

          /* driver-specific scan */
          began = jiffies;
          if (shost->hostt->scan_start)
              shost->hostt->scan_start(shost);

          for (;;) {
              if (shost->hostt->scan_finished(shost, jiffies - began))
                  break;
              msleep(10);
          }
      }

      如果主機(jī)適配器模板設(shè)置了自定義掃描函數(shù),do_scsi_scan_host函數(shù)將會(huì)調(diào)用。如果沒有設(shè)置則使用默認(rèn)的掃描函數(shù)scsi_scan_host_selected執(zhí)行掃描。

      /*
       * Scan the given channel/id/lun on a host; SCAN_WILD_CARD in any of the
       * three means "all".
       *
       * Returns -EINVAL when a non-wildcard channel, id or lun is beyond the
       * host's limits, otherwise 0 (including when the host state does not
       * allow scanning or the host cannot be resumed).
       */
      int scsi_scan_host_selected(struct Scsi_Host *shost, unsigned int channel,
                      unsigned int id, u64 lun, int rescan)
      {
          SCSI_LOG_SCAN_BUS(3, shost_printk (KERN_INFO, shost,
              "%s: <%u:%u:%llu>\n",
              __func__, channel, id, lun));

          /* reject explicit values that are out of range for this host */
          if (channel != SCAN_WILD_CARD && channel > shost->max_channel)
              return -EINVAL;
          if (id != SCAN_WILD_CARD && id >= shost->max_id)
              return -EINVAL;
          if (lun != SCAN_WILD_CARD && lun >= shost->max_lun)
              return -EINVAL;

          mutex_lock(&shost->scan_mutex);
          if (!shost->async_scan)
              scsi_complete_async_scans();

          /* scan only if the host state allows it and the host is resumed */
          if (scsi_host_scan_allowed(shost) && scsi_autopm_get_host(shost) == 0) {
              if (channel != SCAN_WILD_CARD) {
                  /* one specific channel */
                  scsi_scan_channel(shost, channel, id, lun, rescan);
              } else {
                  /* every channel from 0 up to and including max_channel */
                  unsigned int ch = 0;

                  while (ch <= shost->max_channel) {
                      scsi_scan_channel(shost, ch, id, lun, rescan);
                      ch++;
                  }
              }
              scsi_autopm_put_host(shost);
          }
          mutex_unlock(&shost->scan_mutex);

          return 0;
      }

      scsi_scan_host_selected函數(shù)掃描指定的主機(jī)適配器,根據(jù)輸入的參數(shù)決定是否遍歷掃描所有channel或掃描指定channel,通過函數(shù)scsi_scan_channel完成。

      /*
       * Scan one channel of a host.  A specific @id scans just that target;
       * SCAN_WILD_CARD walks all ids below max_id, from high to low when the
       * host requests reverse ordering.
       */
      static void scsi_scan_channel(struct Scsi_Host *shost, unsigned int channel,
                        unsigned int id, u64 lun, int rescan)
      {
          unsigned int idx;

          if (id != SCAN_WILD_CARD) {
              __scsi_scan_target(&shost->shost_gendev, channel,
                      id, lun, rescan);
              return;
          }

          /*
           * XXX adapter drivers when possible (FCP, iSCSI) could modify
           * max_id to match the current max, not the absolute max.
           *
           * XXX add a shost id iterator, so for example, the FC ID can be
           * the same as a target id without a huge overhead of sparse id's.
           */
          for (idx = 0; idx < shost->max_id; idx++) {
              /* reverse_ordering: scan from high to low id */
              unsigned int target_id = shost->reverse_ordering ?
                      shost->max_id - idx - 1 : idx;

              __scsi_scan_target(&shost->shost_gendev, channel,
                      target_id, lun, rescan);
          }
      }

      __scsi_scan_target函数扫描指定target内部的lun。

      /*
       * Scan one target (@channel, @id) below @parent.
       *
       * The host's own id is never scanned.  A specific @lun probes just that
       * lun.  With SCAN_WILD_CARD, LUN 0 is probed first; if a lun or target
       * responds, a REPORT LUNS scan is tried, falling back to a sequential
       * lun scan when that returns non-zero.
       */
      static void __scsi_scan_target(struct device *parent, unsigned int channel,
              unsigned int id, u64 lun, int rescan)
      {
          struct Scsi_Host *shost = dev_to_shost(parent);
          int bflags = 0;
          int res;
          struct scsi_target *starget;
      
          if (shost->this_id == id)
              /*
               * Don't scan the host adapter
               */
              return;
          /* allocate and initialise the target structure for this id */
          starget = scsi_alloc_target(parent, channel, id);
          if (!starget)
              return;
          scsi_autopm_get_target(starget);
      
          if (lun != SCAN_WILD_CARD) {
              /*
               * Scan for a specific host/chan/id/lun: probe the given
               * lun of the target and add the scsi_device to the
               * subsystem.
               */
              scsi_probe_and_add_lun(starget, lun, NULL, NULL, rescan, NULL);
              goto out_reap;
          }
      
          /*
           * Scan LUN 0, if there is some response, scan further. Ideally, we
           * would not configure LUN 0 until all LUNs are scanned.
           */
          res = scsi_probe_and_add_lun(starget, 0, &bflags, NULL, rescan, NULL);
          if (res == SCSI_SCAN_LUN_PRESENT || res == SCSI_SCAN_TARGET_PRESENT) {
              if (scsi_report_lun_scan(starget, bflags, rescan) != 0) /* try a REPORT LUNS based scan first */
                  /*
                   * The REPORT LUN did not scan the target,
                   * do a sequential scan.
                   */
                  scsi_sequential_lun_scan(starget, bflags,  /* fallback: sequential lun scan */
                               starget->scsi_level, rescan);
          }
      
       out_reap:
          scsi_autopm_put_target(starget);
          /*
           * paired with scsi_alloc_target(): determine if the target has
           * any children at all and if not, nuke it
           */
          scsi_target_reap(starget);
      
          put_device(&starget->dev);
      }

      掃描到target時(shí)分配并初始化scsi_target結(jié)構(gòu),scsi_probe_and_add_lun函數(shù)完成探測(cè)target中的lun,并將發(fā)現(xiàn)的lun添加到系統(tǒng)。

      /**
       * scsi_probe_and_add_lun - probe one LUN of a target and add it if present
       * @starget:  target that owns the LUN
       * @lun:      logical unit number to probe
       * @bflagsp:  if non-NULL, receives the device flags determined for the LUN
       * @sdevp:    if non-NULL, receives the scsi_device when the result is
       *            SCSI_SCAN_LUN_PRESENT (a reference is kept for the caller:
       *            either the one obtained by the lookup or one taken with
       *            scsi_device_get())
       * @rescan:   non-zero on a rescan; 0 on the first scan of a host adapter
       * @hostdata: passed through to scsi_alloc_sdev()
       *
       * Looks the LUN up on the target; if it is not known yet a scsi_device is
       * allocated, an INQUIRY is issued through scsi_probe_lun() and, unless the
       * INQUIRY data says no logical unit is there, the device is registered
       * through scsi_add_lun().
       *
       * Return:
       *   SCSI_SCAN_NO_RESPONSE    - allocation or INQUIRY failed, or the device
       *                              could not be added/referenced
       *   SCSI_SCAN_TARGET_PRESENT - the target answered but reports no usable
       *                              logical unit at @lun
       *   SCSI_SCAN_LUN_PRESENT    - the LUN exists (already known or newly added)
       */
      static int scsi_probe_and_add_lun(struct scsi_target *starget,
                        u64 lun, int *bflagsp,
                        struct scsi_device **sdevp, int rescan,
                        void *hostdata)
      {
          struct scsi_device *sdev;
          unsigned char *result;
          int bflags, res = SCSI_SCAN_NO_RESPONSE, result_len = 256;
          struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
      
          /*
           * The rescan flag is used as an optimization, the first scan of a
           * host adapter calls into here with rescan == 0.
           */
          sdev = scsi_device_lookup_by_target(starget, lun);  /* is this LUN already known on the target? */
          if (sdev) {
              /*
               * On a rescan, or when scsi_device_created() is false for the
               * existing device, nothing is probed: the LUN is simply
               * reported as present.
               */
              if (rescan || !scsi_device_created(sdev)) {
                  SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                      "scsi scan: device exists on %s\n",
                      dev_name(&sdev->sdev_gendev)));
      
                  /*
                   * Read vendor/model while the lookup reference is
                   * still held; it may be dropped just below.
                   */
                  if (bflagsp)
                      *bflagsp = scsi_get_device_flags(sdev,
                                       sdev->vendor,
                                       sdev->model);
      
                  if (sdevp)
                      *sdevp = sdev;  /* hand the reference to the caller */
                  else
                      scsi_device_put(sdev);
      
                  return SCSI_SCAN_LUN_PRESENT;
              }
              /*
               * NOTE(review): the lookup reference is dropped here although
               * sdev is still used below - presumably the device is kept
               * alive by its membership in the target; confirm.
               */
              scsi_device_put(sdev);
          } else
              sdev = scsi_alloc_sdev(starget, lun, hostdata); /* LUN unknown: allocate a scsi_device */
          if (!sdev)
              goto out;
      
          /* Buffer for the INQUIRY response. */
          result = kmalloc(result_len, GFP_ATOMIC |
                  ((shost->unchecked_isa_dma) ? __GFP_DMA : 0));
          if (!result)
              goto out_free_sdev;
      
          if (scsi_probe_lun(sdev, result, result_len, &bflags)) /* send INQUIRY to the device */
              goto out_free_result;
      
          if (bflagsp)
              *bflagsp = bflags;
          /*
           * result contains valid SCSI INQUIRY data.
           */
          if (((result[0] >> 5) == 3) && !(bflags & BLIST_ATTACH_PQ3)) {
              /*
               * For a Peripheral qualifier 3 (011b), the SCSI
               * spec says: The device server is not capable of
               * supporting a physical device on this logical
               * unit.
               *
               * For disks, this implies that there is no
               * logical disk configured at sdev->lun, but there
               * is a target id responding.
               */
              SCSI_LOG_SCAN_BUS(2, sdev_printk(KERN_INFO, sdev, "scsi scan:"
                         " peripheral qualifier of 3, device not"
                         " added\n"));
              if (lun == 0) {
                  SCSI_LOG_SCAN_BUS(1, {
                      unsigned char vend[9];
                      unsigned char mod[17];
      
                      sdev_printk(KERN_INFO, sdev,
                          "scsi scan: consider passing scsi_mod."
                          "dev_flags=%s:%s:0x240 or 0x1000240\n",
                          scsi_inq_str(vend, result, 8, 16),
                          scsi_inq_str(mod, result, 16, 32));
                  });
      
              }
      
              res = SCSI_SCAN_TARGET_PRESENT;
              goto out_free_result;
          }
      
          /*
           * Some targets may set slight variations of PQ and PDT to signal
           * that no LUN is present, so don't add sdev in these cases.
           * Two specific examples are:
           * 1) NetApp targets: return PQ=1, PDT=0x1f
           * 2) USB UFI: returns PDT=0x1f, with the PQ bits being "reserved"
           *    in the UFI 1.0 spec (we cannot rely on reserved bits).
           *
           * References:
           * 1) SCSI SPC-3, pp. 145-146
           * PQ=1: "A peripheral device having the specified peripheral
           * device type is not connected to this logical unit. However, the
           * device server is capable of supporting the specified peripheral
           * device type on this logical unit."
           * PDT=0x1f: "Unknown or no device type"
           * 2) USB UFI 1.0, p. 20
           * PDT=00h Direct-access device (floppy)
           * PDT=1Fh none (no FDD connected to the requested logical unit)
           */
          if (((result[0] >> 5) == 1 || starget->pdt_1f_for_no_lun) &&
              (result[0] & 0x1f) == 0x1f &&
              !scsi_is_wlun(lun)) {
              SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                          "scsi scan: peripheral device type"
                          " of 31, no device added\n"));
              res = SCSI_SCAN_TARGET_PRESENT;
              goto out_free_result;
          }
      
          /* Configure the device from the INQUIRY data and add it to the subsystem. */
          res = scsi_add_lun(sdev, result, &bflags, shost->async_scan);
          if (res == SCSI_SCAN_LUN_PRESENT) {
              if (bflags & BLIST_KEY) {
                  sdev->lockable = 0;
                  scsi_unlock_floptical(sdev, result);
              }
          }
      
       out_free_result:
          kfree(result);
       out_free_sdev:
          if (res == SCSI_SCAN_LUN_PRESENT) {
              if (sdevp) {
                  /* The caller wants the device: take a reference for it. */
                  if (scsi_device_get(sdev) == 0) {
                      *sdevp = sdev;
                  } else {
                      __scsi_remove_device(sdev);
                      res = SCSI_SCAN_NO_RESPONSE;
                  }
              }
          } else
              __scsi_remove_device(sdev);  /* no LUN added: tear the device down */
       out:
          return res;
      }

      scsi_probe_and_add_lun函数由名字可知,完成lun的probe和add两个操作:
      1. 探测逻辑设备scsi_probe_lun,发送INQUIRY命令到具体设备。
      2. 添加逻辑设备到系统scsi_add_lun,根据INQUIRY命令返回值添加lun到系统。

      /**
       * scsi_probe_lun - probe a single LUN by sending it an INQUIRY
       * @sdev:       device to send the INQUIRY to
       * @inq_result: buffer that receives the INQUIRY data
       * @result_len: size of @inq_result as allocated by the caller; it is not
       *              referenced inside this function
       * @bflags:     receives the flags returned by scsi_get_device_flags() for
       *              the reported vendor/model; set to 0 on entry
       *
       * Performs up to three INQUIRY passes with different transfer lengths.
       * On success it records the usable INQUIRY length (never below 36), the
       * SCSI level (on both the device and its target) and the lun_in_cdb
       * setting in @sdev.
       *
       * Return: 0 when the last INQUIRY attempt succeeded, -EIO otherwise.
       */
      static int scsi_probe_lun(struct scsi_device *sdev, unsigned char *inq_result,
                    int result_len, int *bflags)
      {
          unsigned char scsi_cmd[MAX_COMMAND_SIZE];
          int first_inquiry_len, try_inquiry_len, next_inquiry_len;
          int response_len = 0;
          int pass, count, result;
          struct scsi_sense_hdr sshdr;
      
          *bflags = 0;
      
          /* Perform up to 3 passes.  The first pass uses a conservative
           * transfer length of 36 unless sdev->inquiry_len specifies a
           * different value. */
          first_inquiry_len = sdev->inquiry_len ? sdev->inquiry_len : 36;
          try_inquiry_len = first_inquiry_len;
          pass = 1;
      
       next_pass:
          SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                      "scsi scan: INQUIRY pass %d length %d\n",
                      pass, try_inquiry_len));
      
          /* Each pass gets up to three chances to ignore Unit Attention */
          for (count = 0; count < 3; ++count) {
              int resid;
      
              memset(scsi_cmd, 0, 6);
              scsi_cmd[0] = INQUIRY;      /* opcode: INQUIRY */
              scsi_cmd[4] = (unsigned char) try_inquiry_len;
      
              memset(inq_result, 0, try_inquiry_len);
              /* Issue the command; the literal 3 is presumably the retry count taken by scsi_execute_req() */
              result = scsi_execute_req(sdev,  scsi_cmd, DMA_FROM_DEVICE,
                            inq_result, try_inquiry_len, &sshdr,
                            HZ / 2 + HZ * scsi_inq_timeout, 3,
                            &resid);
      
              SCSI_LOG_SCAN_BUS(3, sdev_printk(KERN_INFO, sdev,
                      "scsi scan: INQUIRY %s with code 0x%x\n",
                      result ? "failed" : "successful", result));
      
              if (result) {
                  /*
                   * not-ready to ready transition [asc/ascq=0x28/0x0]
                   * or power-on, reset [asc/ascq=0x29/0x0], continue.
                   * INQUIRY should not yield UNIT_ATTENTION
                   * but many buggy devices do so anyway.
                   */
                  if ((driver_byte(result) & DRIVER_SENSE) &&
                      scsi_sense_valid(&sshdr)) {
                      if ((sshdr.sense_key == UNIT_ATTENTION) &&
                          ((sshdr.asc == 0x28) ||
                           (sshdr.asc == 0x29)) &&
                          (sshdr.ascq == 0))
                          continue;
                  }
              } else {
                  /*
                   * if nothing was transferred, we try
                   * again. It's a workaround for some USB
                   * devices.
                   */
                  if (resid == try_inquiry_len)
                      continue;
              }
              break;
          }
      
          if (result == 0) {
              /* Clean up the vendor (8), product (16) and revision (4) fields. */
              sanitize_inquiry_string(&inq_result[8], 8);
              sanitize_inquiry_string(&inq_result[16], 16);
              sanitize_inquiry_string(&inq_result[32], 4);
      
              /* Byte 4 plus 5 gives the total response length the device claims. */
              response_len = inq_result[4] + 5;
              if (response_len > 255)
                  response_len = first_inquiry_len;   /* sanity */
      
              /*
               * Get any flags for this device.
               *
               * XXX add a bflags to scsi_device, and replace the
               * corresponding bit fields in scsi_device, so bflags
               * need not be passed as an argument.
               */
              *bflags = scsi_get_device_flags(sdev, &inq_result[8],
                      &inq_result[16]);
      
              /* When the first pass succeeds we gain information about
               * what larger transfer lengths might work. */
              if (pass == 1) {
                  if (BLIST_INQUIRY_36 & *bflags)
                      next_inquiry_len = 36;
                  else if (BLIST_INQUIRY_58 & *bflags)
                      next_inquiry_len = 58;
                  else if (sdev->inquiry_len)
                      next_inquiry_len = sdev->inquiry_len;
                  else
                      next_inquiry_len = response_len;
      
                  /* If more data is available perform the second pass */
                  if (next_inquiry_len > try_inquiry_len) {
                      try_inquiry_len = next_inquiry_len;
                      pass = 2;
                      goto next_pass;
                  }
              }
      
          } else if (pass == 2) {
              sdev_printk(KERN_INFO, sdev,
                      "scsi scan: %d byte inquiry failed.  "
                      "Consider BLIST_INQUIRY_36 for this device\n",
                      try_inquiry_len);
      
              /* If this pass failed, the third pass goes back and transfers
               * the same amount as we successfully got in the first pass. */
              try_inquiry_len = first_inquiry_len;
              pass = 3;
              goto next_pass;
          }
      
          /* If the last transfer attempt got an error, assume the
           * peripheral doesn't exist or is dead. */
          if (result)
              return -EIO;
      
          /* Don't report any more data than the device says is valid */
          sdev->inquiry_len = min(try_inquiry_len, response_len);
      
          /*
           * XXX Abort if the response length is less than 36? If less than
           * 32, the lookup of the device flags (above) could be invalid,
           * and it would be possible to take an incorrect action - we do
           * not want to hang because of a short INQUIRY. On the flip side,
           * if the device is spun down or becoming ready (and so it gives a
           * short INQUIRY), an abort here prevents any further use of the
           * device, including spin up.
           *
           * On the whole, the best approach seems to be to assume the first
           * 36 bytes are valid no matter what the device says.  That's
           * better than copying < 36 bytes to the inquiry-result buffer
           * and displaying garbage for the Vendor, Product, or Revision
           * strings.
           */
          if (sdev->inquiry_len < 36) {
              /* Log the short-INQUIRY notice only once per host. */
              if (!sdev->host->short_inquiry) {
                  shost_printk(KERN_INFO, sdev->host,
                          "scsi scan: INQUIRY result too short (%d),"
                          " using 36\n", sdev->inquiry_len);
                  sdev->host->short_inquiry = 1;
              }
              sdev->inquiry_len = 36;
          }
      
          /*
           * Related to the above issue:
           *
           * XXX Devices (disk or all?) should be sent a TEST UNIT READY,
           * and if not ready, sent a START_STOP to start (maybe spin up) and
           * then send the INQUIRY again, since the INQUIRY can change after
           * a device is initialized.
           *
           * Ideally, start a device if explicitly asked to do so.  This
           * assumes that a device is spun up on power on, spun down on
           * request, and then spun up on request.
           */
      
          /*
           * The scanning code needs to know the scsi_level, even if no
           * device is attached at LUN 0 (SCSI_SCAN_TARGET_PRESENT) so
           * non-zero LUNs can be scanned.
           */
          sdev->scsi_level = inq_result[2] & 0x07;
          if (sdev->scsi_level >= 2 ||
              (sdev->scsi_level == 1 && (inq_result[3] & 0x0f) == 1))
              sdev->scsi_level++;
          sdev->sdev_target->scsi_level = sdev->scsi_level;
      
          /*
           * If SCSI-2 or lower, and if the transport requires it,
           * store the LUN value in CDB[1].
           */
          sdev->lun_in_cdb = 0;
          if (sdev->scsi_level <= SCSI_2 &&
              sdev->scsi_level != SCSI_UNKNOWN &&
              !sdev->host->no_scsi2_lun_in_cdb)
              sdev->lun_in_cdb = 1;
      
          return 0;
      }
      
      
      static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
              int *bflags, int async)
      {
          int ret;
      
          /*
           * XXX do not save the inquiry, since it can change underneath us,
           * save just vendor/model/rev.
           *
           * Rather than save it and have an ioctl that retrieves the saved
           * value, have an ioctl that executes the same INQUIRY code used
           * in scsi_probe_lun, let user level programs doing INQUIRY
           * scanning run at their own risk, or supply a user level program
           * that can correctly scan.
           */
      
          /*
           * Copy at least 36 bytes of INQUIRY data, so that we don't
           * dereference unallocated memory when accessing the Vendor,
           * Product, and Revision strings.  Badly behaved devices may set
           * the INQUIRY Additional Length byte to a small value, indicating
           * these strings are invalid, but often they contain plausible data
           * nonetheless.  It doesn't matter if the device sent < 36 bytes
           * total, since scsi_probe_lun() initializes inq_result with 0s.
           */
          sdev->inquiry = kmemdup(inq_result,
                      max_t(size_t, sdev->inquiry_len, 36),
                      GFP_ATOMIC);
          if (sdev->inquiry == NULL)
              return SCSI_SCAN_NO_RESPONSE;
      
          sdev->vendor = (char *) (sdev->inquiry + 8); //第8個(gè)字節(jié)到第15個(gè)字節(jié)是vendor identification
          sdev->model = (char *) (sdev->inquiry + 16); //第16個(gè)字節(jié)到第31個(gè)字節(jié)是product identification
          sdev->rev = (char *) (sdev->inquiry + 32);   //第32個(gè)字節(jié)到第35個(gè)字節(jié)是product revision level
      
          if (strncmp(sdev->vendor, "ATA     ", 8) == 0) {
              /*
               * sata emulation layer device.  This is a hack to work around
               * the SATL power management specifications which state that
               * when the SATL detects the device has gone into standby
               * mode, it shall respond with NOT READY.
               */
              sdev->allow_restart = 1;
          }
      
          if (*bflags & BLIST_ISROM) {
              sdev->type = TYPE_ROM;
              sdev->removable = 1;
          } else {
              sdev->type = (inq_result[0] & 0x1f);
              sdev->removable = (inq_result[1] & 0x80) >> 7;
      
              /*
               * some devices may respond with wrong type for
               * well-known logical units. Force well-known type
               * to enumerate them correctly.
               */
              if (scsi_is_wlun(sdev->lun) && sdev->type != TYPE_WLUN) {
                  sdev_printk(KERN_WARNING, sdev,
                      "%s: correcting incorrect peripheral device type 0x%x for W-LUN 0x%16xhN\n",
                      __func__, sdev->type, (unsigned int)sdev->lun);
                  sdev->type = TYPE_WLUN;
              }
      
          }
      
          if (sdev->type == TYPE_RBC || sdev->type == TYPE_ROM) {
              /* RBC and MMC devices can return SCSI-3 compliance and yet
               * still not support REPORT LUNS, so make them act as
               * BLIST_NOREPORTLUN unless BLIST_REPORTLUN2 is
               * specifically set */
              if ((*bflags & BLIST_REPORTLUN2) == 0)
                  *bflags |= BLIST_NOREPORTLUN;
          }
      
          /*
           * For a peripheral qualifier (PQ) value of 1 (001b), the SCSI
           * spec says: The device server is capable of supporting the
           * specified peripheral device type on this logical unit. However,
           * the physical device is not currently connected to this logical
           * unit.
           *
           * The above is vague, as it implies that we could treat 001 and
           * 011 the same. Stay compatible with previous code, and create a
           * scsi_device for a PQ of 1
           *
           * Don't set the device offline here; rather let the upper
           * level drivers eval the PQ to decide whether they should
           * attach. So remove ((inq_result[0] >> 5) & 7) == 1 check.
           */
      
          sdev->inq_periph_qual = (inq_result[0] >> 5) & 7;
          sdev->lockable = sdev->removable;
          sdev->soft_reset = (inq_result[7] & 1) && ((inq_result[3] & 7) == 2);
      
          if (sdev->scsi_level >= SCSI_3 ||
                  (sdev->inquiry_len > 56 && inq_result[56] & 0x04))
              sdev->ppr = 1;
          if (inq_result[7] & 0x60)
              sdev->wdtr = 1;
          if (inq_result[7] & 0x10)
              sdev->sdtr = 1;
      
          sdev_printk(KERN_NOTICE, sdev, "%s %.8s %.16s %.4s PQ: %d "
                  "ANSI: %d%s\n", scsi_device_type(sdev->type),
                  sdev->vendor, sdev->model, sdev->rev,
                  sdev->inq_periph_qual, inq_result[2] & 0x07,
                  (inq_result[3] & 0x0f) == 1 ? " CCS" : "");
      
          if ((sdev->scsi_level >= SCSI_2) && (inq_result[7] & 2) &&
              !(*bflags & BLIST_NOTQ)) {
              sdev->tagged_supported = 1;
              sdev->simple_tags = 1;
          }
      
          /*
           * Some devices (Texel CD ROM drives) have handshaking problems
           * when used with the Seagate controllers. borken is initialized
           * to 1, and then set it to 0 here.
           */
          if ((*bflags & BLIST_BORKEN) == 0)
              sdev->borken = 0;
      
          if (*bflags & BLIST_NO_ULD_ATTACH)
              sdev->no_uld_attach = 1;
      
          /*
           * Apparently some really broken devices (contrary to the SCSI
           * standards) need to be selected without asserting ATN
           */
          if (*bflags & BLIST_SELECT_NO_ATN)
              sdev->select_no_atn = 1;
      
          /*
           * Maximum 512 sector transfer length
           * broken RA4x00 Compaq Disk Array
           */
          if (*bflags & BLIST_MAX_512)
              blk_queue_max_hw_sectors(sdev->request_queue, 512);
          /*
           * Max 1024 sector transfer length for targets that report incorrect
           * max/optimal lengths and relied on the old block layer safe default
           */
          else if (*bflags & BLIST_MAX_1024)
              blk_queue_max_hw_sectors(sdev->request_queue, 1024);
      
          /*
           * Some devices may not want to have a start command automatically
           * issued when a device is added.
           */
          if (*bflags & BLIST_NOSTARTONADD)
              sdev->no_start_on_add = 1;
      
          if (*bflags & BLIST_SINGLELUN)
              scsi_target(sdev)->single_lun = 1;
      
          sdev->use_10_for_rw = 1;
      
          if (*bflags & BLIST_MS_SKIP_PAGE_08)
              sdev->skip_ms_page_8 = 1;
      
          if (*bflags & BLIST_MS_SKIP_PAGE_3F)
              sdev->skip_ms_page_3f = 1;
      
          if (*bflags & BLIST_USE_10_BYTE_MS)
              sdev->use_10_for_ms = 1;
      
          /* some devices don't like REPORT SUPPORTED OPERATION CODES
           * and will simply timeout causing sd_mod init to take a very
           * very long time */
          if (*bflags & BLIST_NO_RSOC)
              sdev->no_report_opcodes = 1;
      
          /* set the device running here so that slave configure
           * may do I/O */
          ret = scsi_device_set_state(sdev, SDEV_RUNNING); //狀態(tài)
          if (ret) {
              ret = scsi_device_set_state(sdev, SDEV_BLOCK);
      
              if (ret) {
                  sdev_printk(KERN_ERR, sdev,
                          "in wrong state %s to complete scan\n",
                          scsi_device_state_name(sdev->sdev_state));
                  return SCSI_SCAN_NO_RESPONSE;
              }
          }
      
          if (*bflags & BLIST_MS_192_BYTES_FOR_3F)
              sdev->use_192_bytes_for_3f = 1;
      
          if (*bflags & BLIST_NOT_LOCKABLE)
              sdev->lockable = 0;
      
          if (*bflags & BLIST_RETRY_HWERROR)
              sdev->retry_hwerror = 1;
      
          if (*bflags & BLIST_NO_DIF)
              sdev->no_dif = 1;
      
          sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
      
          if (*bflags & BLIST_TRY_VPD_PAGES)
              sdev->try_vpd_pages = 1;
          else if (*bflags & BLIST_SKIP_VPD_PAGES)
              sdev->skip_vpd_pages = 1;
      
          transport_configure_device(&sdev->sdev_gendev); //把lun配置到scsi傳輸層
      
          if (sdev->host->hostt->slave_configure) {
              ret = sdev->host->hostt->slave_configure(sdev); //主機(jī)適配器模板設(shè)置的回調(diào),對(duì)scsi_device(lun)執(zhí)行特定的初始化
              if (ret) {
                  /*
                   * if LLDD reports slave not present, don't clutter
                   * console with alloc failure messages
                   */
                  if (ret != -ENXIO) {
                      sdev_printk(KERN_ERR, sdev,
                          "failed to configure device\n");
                  }
                  return SCSI_SCAN_NO_RESPONSE;
              }
          }
      
          if (sdev->scsi_level >= SCSI_3)
              scsi_attach_vpd(sdev);
      
          sdev->max_queue_depth = sdev->queue_depth;  //設(shè)置最大隊(duì)列深度
      
          /*
           * Ok, the device is now all set up, we can
           * register it and tell the rest of the kernel
           * about it.
           */ //添加scsi_device(lun)到sysfs
          if (!async && scsi_sysfs_add_sdev(sdev) != 0)
              return SCSI_SCAN_NO_RESPONSE;
      
          return SCSI_SCAN_LUN_PRESENT;
      }

        本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
        轉(zhuǎn)藏 分享 獻(xiàn)花(0

        0條評(píng)論

        發(fā)表

        請(qǐng)遵守用戶 評(píng)論公約

        類似文章 更多