首页 / 操作系统 / Linux / Linux网络协议栈之驱动框架
网卡驱动既可以以模块的方式加载,也可以在内核初始化的时候加载。我们选定e100系列的网卡来说明网卡驱动的一般框架。网卡设备通用数据结构:struct net_device{ /* * This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name * of the interface. */ /*网络设备名*/ char name[IFNAMSIZ]; /* device name hash chain */ /*根据网络设备名以散列表的形式组织到dev_name_head散列表中,这样就可以通过网络设备名快速地定位到网络设备*/ struct hlist_node name_hlist; /* * I/O specific fields * FIXME: Merge these and struct ifmap into one */ /*网络设备共享内存的起始和终止地址*/ unsigned long mem_end; /* shared mem end */ unsigned long mem_start; /* shared mem start */ /*网络接口I/O基地址,在探测设备时被初始化,ifconfig命令可显示和修改当前值*/ unsigned long base_addr; /* device I/O address */ /*分配给设备的中断号,一般在初始化设备时被初始化*/ unsigned int irq; /* device IRQ number */ /* * Some hardware also needs these fields, but they are not * part of the usual set specified in Space.c. */ /*指定在多端口设备上使用哪个端口*/ unsigned char if_port; /* Selectable AUI, TP,..*/ /*为设备分配的DMA通道*/ unsigned char dma; /* DMA channel */ /*设备状态*/ unsigned long state; /*网络设备组织*/ struct net_device *next; /*驱动程序的初始化函数*/ /* The device initialization function. Called only once. */ int (*init)(struct net_device *dev); /* ------- Fields preinitialized in Space.c finish here ------- */ /* Net device features */ /*接口支持特性*/ unsigned long features;#define NETIF_F_SG 1 /* Scatter/gather IO. */#define NETIF_F_IP_CSUM 2 /* Can checksum only TCP/UDP over IPv4. */#define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopback. */#define NETIF_F_HW_CSUM 8 /* Can checksum all the packets. */#define NETIF_F_HIGHDMA 32 /* Can DMA to high memory. */#define NETIF_F_FRAGLIST 64 /* Scatter/gather IO. */#define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */#define NETIF_F_GSO 2048 /* Enable software GSO. 
*/#define NETIF_F_LLTX 4096 /* LockLess TX */ /* Segmentation offload features */#define NETIF_F_GSO_SHIFT 16#define NETIF_F_GSO_MASK 0xffff0000#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)#define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)#define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)#define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT) /* List of features with software fallbacks. */#define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6) #define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)#define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM) /*用于连接那些已经调度有数据报输出的网络设备指针*/ struct net_device *next_sched; /* Interface index. Unique device identifier */ /*网络设备的索引号*/ int ifindex; /*网络设备的唯一标识,主要用于虚拟隧道设备*/ int iflink; /*提供给应用程序获得接口统计信息的接口*/ struct net_device_stats* (*get_stats)(struct net_device *dev); /* List of functions to handle Wireless Extensions (instead of ioctl). * See <net/iw_handler.h> for details. Jean II */ /*无线网相关*/ const struct iw_handler_def * wireless_handlers; /* Instance data managed by the core of Wireless Extensions. */ struct iw_public_data * wireless_data; const struct ethtool_ops *ethtool_ops; /* * This marks the end of the "visible" part of the structure. All * fields hereafter are internal to the system, and may change at * will (read: may be cleaned up at will). */ unsigned int flags; /* interface flags (a la BSD) */ /*记录当前网络设备IFF_PROMISC和IFF_ALLMULTI的状态,用来配合flags的设置*/ unsigned short gflags; unsigned short priv_flags; /* Like "flags" but invisible to userspace. 
*/ unsigned short padded; /* How much padding added by alloc_netdev() */ unsigned char operstate; /* RFC2863 operstate */ unsigned char link_mode; /* mapping policy to operstate */ unsigned mtu; /* interface MTU value */ unsigned short type; /* interface hardware type */ unsigned short hard_header_len; /* hardware hdr length */ struct net_device *master; /* Pointer to master device of a group, * which this device is member of. */ /* Interface address info. */ /*MAC地址,通常初始化时从硬件中读出来*/ unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */ unsigned char addr_len; /* hardware address length */ unsigned short dev_id; /* for shared network cards */ struct dev_mc_list *mc_list; /* Multicast mac addresses */ int mc_count; /* Number of installed mcasts */ /*设置网络设备混杂模式计数器*/ int promiscuity; /*设置网络设备接收所有组播报文的计数器,每次设置或是退出操作,该字段都会相应的加或减1,为0时,网络设备才真正不再接收组播报文*/ int allmulti; /* Protocol specific pointers */ void *atalk_ptr; /* AppleTalk link */ void *ip_ptr; /* IPv4 specific data */ void *dn_ptr; /* DECnet specific data */ void *ip6_ptr; /* IPv6 specific data */ void *ec_ptr; /* Econet specific data */ void *ax25_ptr; /* AX.25 specific data *//* * Cache line mostly used on receive path (including eth_type_trans()) */ /*该结构实例通过该字段连接到softnet_data的poll_list成员上*/ struct list_head poll_list ____cacheline_aligned_in_smp; /* Link to poll list */ /*轮询模式操作接口*/ int (*poll) (struct net_device *dev, int *quota); /*读取数据包的配额,动态变化,由netdev_budget初始化,每次从网络设备中读取数据包后,会从中减去本次读取的数据包数,当该配额等于或小于0时,结束当前轮询,等待下次轮询。这样即使某个网络设备有大量的数据包输入,也能保证其他网络设备能及时收到数据包。在输入时,遍历网络设备轮询队列,从选定的网络设备中读取数据包,一旦已经读取的数据包的数量超过配额,即停止本次读取,将该网络设备移至网络设备轮询队列的队尾,等待下次轮询*/ int quota; /*数据包输入软中断中,单个网络设备读取数据包的配额*/ int weight; unsigned long last_rx; /* Time of last Rx */ /* Interface address info used in eth_type_trans() */ unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast because most packets are unicast) */ unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add *//* * Cache line mostly used on queue transmit path 
(qdisc) */ /* device queue lock */ spinlock_t queue_lock ____cacheline_aligned_in_smp; /*当前使用的根排队规则,配置的排队规则生效时由qdisc_sleeping设置*/ struct Qdisc *qdisc; /*当前配置的排队规则,生效时将被设置到qdisc*/ struct Qdisc *qdisc_sleeping; /*通过链表方式记录配置所在网络的所有排队规则*/ struct list_head qdisc_list; /*可在设备发送队列中排队的最大数据包*/ unsigned long tx_queue_len; /* Max frames per queue allowed */ /* Partially transmitted GSO packet. */ struct sk_buff *gso_skb; /* ingress path synchronizer */ spinlock_t ingress_lock; /*数据包输入的排队规则*/ struct Qdisc *qdisc_ingress;/* * One part is mostly used on xmit path (device) */ /* hard_start_xmit synchronizer */ spinlock_t _xmit_lock ____cacheline_aligned_in_smp; /* cpu id of processor entered to hard_start_xmit or -1, if nobody entered there. */ int xmit_lock_owner; void *priv; /* pointer to private data */ /*驱动提供给上一层发送数据包的接口,在发送数据包时必定会调用该接口*/ int (*hard_start_xmit) (struct sk_buff *skb, struct net_device *dev); /* These may be needed for future network-power-down code. */ unsigned long trans_start; /* Time (in jiffies) of last Tx */ /*网络层确定传输已经超时,而调用驱动程序的tx_timeout接口的最短时间*/ int watchdog_timeo; /* used by dev_watchdog() */ /*用于检测网络设备处于正常的工作状态时,是否存在由于关闭队列功能 而导致发送超时的情况,一旦发生以上状况,就调用网络设备驱动的tx_timeout 接口处理*/ struct timer_list watchdog_timer;/* * refcnt is a very hot point, so align it on SMP */ /* Number of references to this device */ atomic_t refcnt ____cacheline_aligned_in_smp; /* delayed register/unregister */ /*用来连接net_todo_list链表,包含已经注销即将结束的网络设备*/ struct list_head todo_list; /* device index hash chain */ /*根据网络设备的索引,以散列表的形式组织到dev_index_hlist中*/ struct hlist_node index_hlist; /* register/unregister state machine */ enum { NETREG_UNINITIALIZED=0, NETREG_REGISTERED, /* completed register_netdevice */ NETREG_UNREGISTERING, /* called unregister_netdevice */ NETREG_UNREGISTERED, /* completed unregister todo */ NETREG_RELEASED, /* called free_netdev */ } reg_state; /* Called after device is detached from network. 
*/ void (*uninit)(struct net_device *dev); /* Called after last user reference disappears. */ void (*destructor)(struct net_device *dev); /* Pointers to interface service routines. */ /*启用设备的函数指针,完成所需系统资源的注册,打开硬件及其所有设备*/ int (*open)(struct net_device *dev); int (*stop)(struct net_device *dev);#define HAVE_NETDEV_POLL/*根据先前检测到的源和目标硬件地址创建硬件首部*/ int (*hard_header) (struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len);/*用来在传输包之前,ARP解析完成之后,重建硬件首部*/ int (*rebuild_header)(struct sk_buff *skb);#define HAVE_MULTICAST /*将组播地址列表更新到网络设备中*/ void (*set_multicast_list)(struct net_device *dev);#define HAVE_SET_MAC_ADDR /*修改硬件地址接口,需要网络设备支持该功能*/ int (*set_mac_address)(struct net_device *dev, void *addr);#define HAVE_PRIVATE_IOCTL int (*do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);#define HAVE_SET_CONFIG int (*set_config)(struct net_device *dev, struct ifmap *map);#define HAVE_HEADER_CACHE /*根据ARP查询的结果填充hh_cache结构*/ int (*hard_header_cache)(struct neighbour *neigh, struct hh_cache *hh); void (*header_cache_update)(struct hh_cache *hh, struct net_device *dev, unsigned char * haddr);#define HAVE_CHANGE_MTU int (*change_mtu)(struct net_device *dev, int new_mtu); #define HAVE_TX_TIMEOUT void (*tx_timeout) (struct net_device *dev); void (*vlan_rx_register)(struct net_device *dev, struct vlan_group *grp); void (*vlan_rx_add_vid)(struct net_device *dev, unsigned short vid); void (*vlan_rx_kill_vid)(struct net_device *dev, unsigned short vid); int (*hard_header_parse)(struct sk_buff *skb, unsigned char *haddr); /*设置邻居子系统相关的参数*/ int (*neigh_setup)(struct net_device *dev, struct neigh_parms *);#ifdef CONFIG_NETPOLL /*网络设备netpoll信息块*/ struct netpoll_info *npinfo;#endif#ifdef CONFIG_NET_POLL_CONTROLLER /*该函数在禁止中断的情况下,要求驱动程序以轮询模式在接口上查询事件*/ void (*poll_controller)(struct net_device *dev);#endif /* bridge stuff */ struct net_bridge_port *br_port; /* class/net/name entry */ struct class_device class_dev; /* space for optional 
statistics and wireless sysfs groups */ struct attribute_group *sysfs_groups[3];};网卡驱动的注册是在e100_init_module中完成的:static int __init e100_init_module(void){ if(((1 << debug) - 1) & NETIF_MSG_DRV) { printk(KERN_INFO PFX "%s, %s
", DRV_DESCRIPTION, DRV_VERSION); printk(KERN_INFO PFX "%s
", DRV_COPYRIGHT); } return pci_register_driver(&e100_driver);}可见,网卡驱动的编写和一般的PCI驱动是一样的。static struct pci_driver e100_driver = { .name = DRV_NAME, .id_table = e100_id_table, .probe = e100_probe, .remove = __devexit_p(e100_remove),#ifdef CONFIG_PM /* Power Management hooks */ .suspend = e100_suspend, .resume = e100_resume,#endif .shutdown = e100_shutdown, .err_handler = &e100_err_handler,}; 如果网络设备驱动程序被编译进内核,则将在启动时被初始化;否则将在运行时作为模块被加载时初始化。无论初始化在何时发生,由驱动程序控制的网络设备都会被注册。这种情形适用于所有的总线类型,无论是总线体系结构还是模块初始化代码调用注册函数,结果都是一样的。PCI设备驱动程序加载后,最终会执行pci_driver->probe()函数。我们看看e100网卡的驱动注册过程:static int __devinit e100_probe(struct pci_dev *pdev, const struct pci_device_id *ent){ struct net_device *netdev; struct nic *nic; int err; /*分配设备数据结构*/ if(!(netdev = alloc_etherdev(sizeof(struct nic)))) { if(((1 << debug) - 1) & NETIF_MSG_PROBE) printk(KERN_ERR PFX "Etherdev alloc failed, abort.
"); return -ENOMEM; } /*初始化设备*/ netdev->open = e100_open; netdev->stop = e100_close; /*e100网络设备的hard_start_xmit接口实现,最终将数据包输出到硬件*/ netdev->hard_start_xmit = e100_xmit_frame; netdev->get_stats = e100_get_stats; netdev->set_multicast_list = e100_set_multicast_list; netdev->set_mac_address = e100_set_mac_address; netdev->change_mtu = e100_change_mtu; netdev->do_ioctl = e100_do_ioctl; SET_ETHTOOL_OPS(netdev, &e100_ethtool_ops); netdev->tx_timeout = e100_tx_timeout; netdev->watchdog_timeo = E100_WATCHDOG_PERIOD; netdev->poll = e100_poll; netdev->weight = E100_NAPI_WEIGHT;#ifdef CONFIG_NET_POLL_CONTROLLER/*为了实现netpoll接收报文功能,需要实现下面的函数调用,该函数用来模拟网络设备发生中断,进行中断处理*/ netdev->poll_controller = e100_netpoll;#endif strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1); /*在alloc_etherdev中设置的私有属性,即结构nic,在这里提出来*/ nic = netdev_priv(netdev); /*初始化该nic*/ nic->netdev = netdev; nic->pdev = pdev; nic->msg_enable = (1 << debug) - 1; /*设置PCI设备私有数据为网络设备结构实例*/ pci_set_drvdata(pdev, netdev); /* Initialize device before it"s used by a driver. Ask low-level code * to enable I/O and memory. Wake up the device if it was suspended. * Beware, this function can fail.*/ if((err = pci_enable_device(pdev))) { DPRINTK(PROBE, ERR, "Cannot enable PCI device, aborting.
"); goto err_out_free_dev; } if(!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { DPRINTK(PROBE, ERR, "Cannot find proper PCI device " "base address, aborting.
"); err = -ENODEV; goto err_out_disable_pdev; } /*保留资源,包括I/O和内存*/ if((err = pci_request_regions(pdev, DRV_NAME))) { DPRINTK(PROBE, ERR, "Cannot obtain PCI resources, aborting.
"); goto err_out_disable_pdev; } /*DMA相关,探测设备的DMA能力,如果设备支持DMA, 返回0*/ if((err = pci_set_dma_mask(pdev, DMA_32BIT_MASK))) { DPRINTK(PROBE, ERR, "No usable DMA configuration, aborting.
"); goto err_out_free_res; } SET_MODULE_OWNER(netdev); SET_NETDEV_DEV(netdev, &pdev->dev); /*控制状态寄存器映射内存资源*/ nic->csr = ioremap(pci_resource_start(pdev, 0), sizeof(struct csr)); if(!nic->csr) { DPRINTK(PROBE, ERR, "Cannot map device registers, aborting.
"); err = -ENOMEM; goto err_out_free_res; } if(ent->driver_data) nic->flags |= ich; else nic->flags &= ~ich; /*初始化nic相关字段*/ e100_get_defaults(nic); /* locks must be initialized before calling hw_reset */ spin_lock_init(&nic->cb_lock); spin_lock_init(&nic->cmd_lock); spin_lock_init(&nic->mdio_lock); /* Reset the device before pci_set_master() in case device is in some * funky state and has an interrupt pending - hint: we don"t have the * interrupt handler registered yet. */ /*设备复位,写相关寄存器方式实现*/ e100_hw_reset(nic); /*启用设备*/ pci_set_master(pdev); /*初始化两个软件时钟*/ init_timer(&nic->watchdog); nic->watchdog.function = e100_watchdog; nic->watchdog.data = (unsigned long)nic; init_timer(&nic->blink_timer); nic->blink_timer.function = e100_blink_led; nic->blink_timer.data = (unsigned long)nic; /*初始化工作队列*/ INIT_WORK(&nic->tx_timeout_task, e100_tx_timeout_task); /*从DMA区分配*/ if((err = e100_alloc(nic))) { DPRINTK(PROBE, ERR, "Cannot alloc driver memory, aborting.
"); goto err_out_iounmap; } /*读取网卡的EEPROM。其中存放这网卡的MAC地址 */ if((err = e100_eeprom_load(nic))) goto err_out_free; /*初始化nic的物理信息*/ e100_phy_init(nic); memcpy(netdev->dev_addr, nic->eeprom, ETH_ALEN); memcpy(netdev->perm_addr, nic->eeprom, ETH_ALEN); /*验证网卡的MAC地址是否格式正确*/ if(!is_valid_ether_addr(netdev->perm_addr)) { DPRINTK(PROBE, ERR, "Invalid MAC address from " "EEPROM, aborting.
"); err = -EAGAIN; goto err_out_free; } /* Wol magic packet can be enabled from eeprom */ if((nic->mac >= mac_82558_D101_A4) && (nic->eeprom[eeprom_id] & eeprom_id_wol)) nic->flags |= wol_magic; /* ack any pending wake events, disable PME */ /*,这个函数的第二个参数表示一种电源状态 PME#就是Power Management Event Signal,即电源管理事件信号.)PME#信号是PCI Power Spec中出镜率最高的一个名词.如果一个设备希望改变它的电源状态,它就可以发送一个PME#信号.而设备是否允许发送信号也是有开关的,并且每种状态都有一个开关 第三个参数是表示开还是关.即传递1进去就是enable,传递0进去就是disable*/ err = pci_enable_wake(pdev, 0, 0); if (err) DPRINTK(PROBE, ERR, "Error clearing wake event
"); /*网络设备的名称前加上eth*/ strcpy(netdev->name, "eth%d"); /*注册网络设备*/ if((err = register_netdev(netdev))) { DPRINTK(PROBE, ERR, "Cannot register net device, aborting.
"); goto err_out_free; } DPRINTK(PROBE, INFO, "addr 0x%llx, irq %d, " "MAC addr %02X:%02X:%02X:%02X:%02X:%02X
", (unsigned long long)pci_resource_start(pdev, 0), pdev->irq, netdev->dev_addr[0], netdev->dev_addr[1], netdev->dev_addr[2], netdev->dev_addr[3], netdev->dev_addr[4], netdev->dev_addr[5]); return 0;err_out_free: e100_free(nic);err_out_iounmap: iounmap(nic->csr);err_out_free_res: pci_release_regions(pdev);err_out_disable_pdev: pci_disable_device(pdev);err_out_free_dev: pci_set_drvdata(pdev, NULL); free_netdev(netdev); return err;}其辅助函数:分配网络设备结构/*传入的参数为nic结构的大小*/struct net_device *alloc_etherdev(int sizeof_priv){ return alloc_netdev(sizeof_priv, "eth%d", ether_setup);}struct net_device *alloc_netdev(int sizeof_priv, const char *name, void (*setup)(struct net_device *)){ void *p; struct net_device *dev; int alloc_size; BUG_ON(strlen(name) >= sizeof(dev->name)); /* ensure 32-byte alignment of both the device and private area */ /*计算分配的大小为设备结构大小加上nic结构大小*/ alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; /*分配空间*/ p = kzalloc(alloc_size, GFP_KERNEL); if (!p) { printk(KERN_ERR "alloc_netdev: Unable to allocate device.
"); return NULL; } dev = (struct net_device *) (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); /*计算padd大小为结构大小减去对其的数据大小*/ dev->padded = (char *)dev - (char *)p; if (sizeof_priv) /*私有数据为nic结构的起始地址*/ dev->priv = netdev_priv(dev); /*调用参数中的函数指针,初始化设备结构*/ setup(dev); strcpy(dev->name, name); return dev;}/*分配设备结构时调用,用于初始化该设备结构*/void ether_setup(struct net_device *dev){ dev->change_mtu = eth_change_mtu; dev->hard_header = eth_header; dev->rebuild_header = eth_rebuild_header; dev->set_mac_address = eth_mac_addr; dev->hard_header_cache = eth_header_cache; dev->header_cache_update= eth_header_cache_update; dev->hard_header_parse = eth_header_parse; dev->type = ARPHRD_ETHER; dev->hard_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; dev->addr_len = ETH_ALEN; dev->tx_queue_len = 1000; /* Ethernet wants good queues */ dev->flags = IFF_BROADCAST|IFF_MULTICAST; memset(dev->broadcast, 0xFF, ETH_ALEN);}注册网络设备的实际操作由register_netdev(netdev)调用register_netdevice()完成int register_netdevice(struct net_device *dev){ struct hlist_head *head; struct hlist_node *p; int ret; BUG_ON(dev_boot_phase); ASSERT_RTNL(); /*2.6内核支持内核抢占,该函数检查是否需要从新调度 如果是,则进行调度,无论此时进行执行在内核空间还是 用户空间*/ might_sleep(); /*初始化设备的各个字段*/ /* When net_device"s are persistent, this will be fatal. */ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); spin_lock_init(&dev->queue_lock); spin_lock_init(&dev->_xmit_lock); dev->xmit_lock_owner = -1;#ifdef CONFIG_NET_CLS_ACT spin_lock_init(&dev->ingress_lock);#endif dev->iflink = -1; /* Init, if this function is available */ /*如果有init函数,调用该函数进行初始化*/ if (dev->init) { ret = dev->init(dev); if (ret) { if (ret > 0) ret = -EIO; goto out; }