读书人

linux内核源码阅览之facebook硬盘加速

发布时间: 2013-09-22 09:32:58 作者: rapoo

linux内核源码阅读之facebook硬盘加速flashcache之七

这一节讲缓存的写回磁盘流程。这里隆重介绍一下两位幕后的英雄:

1004/*1005 * Clean dirty blocks in this set as needed.1006 *1007 * 1) Select the n blocks that we want to clean (choosing whatever policy), sort them.1008 * 2) Then sweep the entire set looking for other DIRTY blocks that can be tacked onto1009 * any of these blocks to form larger contigous writes. The idea here is that if you1010 * are going to do a write anyway, then we might as well opportunistically write out1011 * any contigous blocks for free (Bob's idea).1012 */1013void1014flashcache_clean_set(struct cache_c *dmc, int set)1015{1016     unsigned long flags;1017     int to_clean = 0;1018     struct dbn_index_pair *writes_list;1019     int nr_writes = 0;1020     int start_index = set * dmc->assoc;1021    1022     /*1023     * If a (fast) removal of this device is in progress, don't kick off1024     * any more cleanings. This isn't sufficient though. We still need to1025     * stop cleanings inside flashcache_dirty_writeback() because we could1026     * have started a device remove after tested this here.1027     */1028     if (atomic_read(&dmc->fast_remove_in_prog))1029          return;1030     writes_list = kmalloc(dmc->assoc * sizeof(struct dbn_index_pair), GFP_NOIO);1031     if (unlikely(sysctl_flashcache_error_inject & WRITES_LIST_ALLOC_FAIL)) {1032          if (writes_list)1033               kfree(writes_list);1034          writes_list = NULL;1035          sysctl_flashcache_error_inject &= ~WRITES_LIST_ALLOC_FAIL;1036     }1037     if (writes_list == NULL) {1038          dmc->memory_alloc_errors++;1039          return;1040     }1041     dmc->clean_set_calls++;1042     spin_lock_irqsave(&dmc->cache_spin_lock, flags);1043     if (dmc->cache_sets[set].nr_dirty < dmc->dirty_thresh_set) {1044          dmc->clean_set_less_dirty++;1045          spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);1046          kfree(writes_list);1047          return;1048     } else1049          to_clean = dmc->cache_sets[set].nr_dirty - dmc->dirty_thresh_set;1050     if 
(sysctl_flashcache_reclaim_policy == FLASHCACHE_FIFO) {1051          int i, scanned;1052          int start_index, end_index;10531054          start_index = set * dmc->assoc;1055          end_index = start_index + dmc->assoc;1056          scanned = 0;1057          i = dmc->cache_sets[set].set_clean_next;1058          DPRINTK("flashcache_clean_set: Set %d", set);1059          while (scanned < dmc->assoc &&1060                 ((dmc->cache_sets[set].clean_inprog + nr_writes) < dmc->max_clean_ios_set) &&1061                 ((nr_writes + dmc->clean_inprog) < dmc->max_clean_ios_total) &&1062                 nr_writes < to_clean) {1063               if ((dmc->cache[i].cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) {    1064                    dmc->cache[i].cache_state |= DISKWRITEINPROG;1065                    writes_list[nr_writes].dbn = dmc->cache[i].dbn;1066                    writes_list[nr_writes].index = i;1067                    nr_writes++;1068               }1069               scanned++;1070               i++;1071               if (i == end_index)1072                    i = start_index;1073          }1074          dmc->cache_sets[set].set_clean_next = i;1075     } else { /* flashcache_reclaim_policy == FLASHCACHE_LRU */1076          struct cacheblock *cacheblk;1077          int lru_rel_index;10781079          lru_rel_index = dmc->cache_sets[set].lru_head;1080          while (lru_rel_index != FLASHCACHE_LRU_NULL &&1081                 ((dmc->cache_sets[set].clean_inprog + nr_writes) < dmc->max_clean_ios_set) &&1082                 ((nr_writes + dmc->clean_inprog) < dmc->max_clean_ios_total) &&1083                 nr_writes < to_clean) {1084               cacheblk = &dmc->cache[lru_rel_index + start_index];              1085               if ((cacheblk->cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) {1086                    cacheblk->cache_state |= DISKWRITEINPROG;1087                    writes_list[nr_writes].dbn = cacheblk->dbn;1088                    
writes_list[nr_writes].index = cacheblk - &dmc->cache[0];1089                    nr_writes++;1090               }1091               lru_rel_index = cacheblk->lru_next;1092          }1093     }1094     if (nr_writes > 0) {1095          int i;10961097          flashcache_merge_writes(dmc, writes_list, &nr_writes, set);1098          dmc->clean_set_ios += nr_writes;1099          spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);1100          for (i = 0 ; i < nr_writes ; i++)1101               flashcache_dirty_writeback(dmc, writes_list[i].index);1102     } else {1103          int do_delayed_clean = 0;11041105          if (dmc->cache_sets[set].nr_dirty > dmc->dirty_thresh_set)1106               do_delayed_clean = 1;1107          spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);1108          if (dmc->cache_sets[set].clean_inprog >= dmc->max_clean_ios_set)1109               dmc->set_limit_reached++;1110          if (dmc->clean_inprog >= dmc->max_clean_ios_total)1111               dmc->total_limit_reached++;1112          if (do_delayed_clean)1113               schedule_delayed_work(&dmc->delayed_clean, 1*HZ);1114          dmc->clean_set_fails++;1115     }1116     kfree(writes_list);1117}

先看输入参数,一个是dmc,另一个是set,即集合下标。1028行判断是否正在快速移除设备,如果是就不做任何处理。1030行申请写记录内存,结构struct dbn_index_pair刚刚已经看过了。1031行是测试用的错误注入:故意让内存申请失败,以验证这种情况下程序是否仍能正确运行。1037行,申请不到内存就返回。1043行,判断集合脏块是否达到水位线,没有达到水位线就不用频繁去刷了。1049行,计算出当前需要刷的脏块数。1050行,如果当前刷脏块策略为FIFO,则按照FIFO遍历集合,记录脏块信息。1075行,是LRU策略。这两个策略没有优劣之分,只能说在某种应用下哪种策略更适合。那么还有其他可以比较的吗?来看一下两种策略的内存开销吧。FIFO的开销是在每个集合管理结构cache_set中增加set_clean_next、set_fifo_next两个字段。LRU的开销是集合中有lru_head, lru_tail,cache块中还有lru_prev, lru_next。注意这里的lru_prev, lru_next都用16位无符号数来表示,在64位系统中每个字段节省了48个位。带来的负面作用是每个集合中最多可以有2的16次方个cache块。在某些应用中用下标表示会远比指针表示来得优越。在我曾经做过的一个项目中,要求在程序异常时能够立即恢复并且不影响正在使用服务的客户,这个时候就要求程序重新启动的时候要完全恢复到原来运行的状态,那么就要求需要恢复的数据通通不能用到指针,因为程序重启后这些指针都已经无效了,这个时候下标表示就派上用场了。在获取脏块记录之后,在1094行下发脏块。1102行有两种情况:一是没有脏块,二是下发脏块达到上限;第二种情况下(集合脏块数仍高于水位线),到1113行隔1秒再调度一次刷脏块。讲到这里,似乎已经把所有的系统都遍历一遍了。然而我们一直是太乐观了,因为还有更重要的好戏还等着我们去探索。

读书人网 >Flash

热点推荐