linux内核源码阅读之facebook硬盘加速flashcache之七
这一节讲缓存的写回磁盘流程。这里隆重介绍一下两位幕后的英雄——FIFO 和 LRU 两种回收策略,下面的 flashcache_clean_set 函数就是分别靠它们来挑选要写回的脏块的:
/*
 * Clean dirty blocks in this set as needed.
 *
 * 1) Select the n blocks that we want to clean (choosing whatever policy), sort them.
 * 2) Then sweep the entire set looking for other DIRTY blocks that can be tacked onto
 * any of these blocks to form larger contiguous writes. The idea here is that if you
 * are going to do a write anyway, then we might as well opportunistically write out
 * any contiguous blocks for free (Bob's idea).
 *
 * @dmc: cache context
 * @set: index of the cache set to scan for dirty blocks
 */
void
flashcache_clean_set(struct cache_c *dmc, int set)
{
	unsigned long flags;
	int to_clean = 0;
	struct dbn_index_pair *writes_list;	/* (dbn, cache index) pairs selected for writeback */
	int nr_writes = 0;
	int start_index = set * dmc->assoc;	/* first cache-block index belonging to this set */
	
	/*
	 * If a (fast) removal of this device is in progress, don't kick off
	 * any more cleanings. This isn't sufficient though. We still need to
	 * stop cleanings inside flashcache_dirty_writeback() because we could
	 * have started a device remove after this test here.
	 */
	if (atomic_read(&dmc->fast_remove_in_prog))
		return;
	/* Scratch array sized for the worst case: every block in the set is dirty. */
	writes_list = kmalloc(dmc->assoc * sizeof(struct dbn_index_pair), GFP_NOIO);
	/* Fault-injection test hook: simulate an allocation failure (one-shot flag). */
	if (unlikely(sysctl_flashcache_error_inject & WRITES_LIST_ALLOC_FAIL)) {
		if (writes_list)
			kfree(writes_list);
		writes_list = NULL;
		sysctl_flashcache_error_inject &= ~WRITES_LIST_ALLOC_FAIL;
	}
	if (writes_list == NULL) {
		dmc->memory_alloc_errors++;
		return;
	}
	dmc->clean_set_calls++;
	spin_lock_irqsave(&dmc->cache_spin_lock, flags);
	/* Below the per-set dirty watermark: no cleaning needed yet. */
	if (dmc->cache_sets[set].nr_dirty < dmc->dirty_thresh_set) {
		dmc->clean_set_less_dirty++;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		kfree(writes_list);
		return;
	} else
		/* Clean just enough blocks to get back down to the watermark. */
		to_clean = dmc->cache_sets[set].nr_dirty - dmc->dirty_thresh_set;
	if (sysctl_flashcache_reclaim_policy == FLASHCACHE_FIFO) {
		/*
		 * FIFO policy: circular scan of the set, resuming from where
		 * the previous cleaning pass stopped (set_clean_next).
		 */
		int i, scanned;
		int start_index, end_index;

		start_index = set * dmc->assoc;
		end_index = start_index + dmc->assoc;
		scanned = 0;
		i = dmc->cache_sets[set].set_clean_next;
		DPRINTK("flashcache_clean_set: Set %d", set);
		/*
		 * Stop when the whole set has been scanned, or when either the
		 * per-set or the global limit on in-flight cleanings would be
		 * exceeded, or when enough candidates have been collected.
		 */
		while (scanned < dmc->assoc &&
		       ((dmc->cache_sets[set].clean_inprog + nr_writes) < dmc->max_clean_ios_set) &&
		       ((nr_writes + dmc->clean_inprog) < dmc->max_clean_ios_total) &&
		       nr_writes < to_clean) {
			/* Pick blocks that are DIRTY with no I/O already in progress. */
			if ((dmc->cache[i].cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) {
				dmc->cache[i].cache_state |= DISKWRITEINPROG;
				writes_list[nr_writes].dbn = dmc->cache[i].dbn;
				writes_list[nr_writes].index = i;
				nr_writes++;
			}
			scanned++;
			i++;
			if (i == end_index)
				i = start_index;	/* wrap around within the set */
		}
		/* Remember where to resume on the next cleaning pass. */
		dmc->cache_sets[set].set_clean_next = i;
	} else { /* flashcache_reclaim_policy == FLASHCACHE_LRU */
		struct cacheblock *cacheblk;
		int lru_rel_index;	/* set-relative index; FLASHCACHE_LRU_NULL terminates the list */

		/* Walk the LRU list from the least-recently-used end. */
		lru_rel_index = dmc->cache_sets[set].lru_head;
		while (lru_rel_index != FLASHCACHE_LRU_NULL &&
		       ((dmc->cache_sets[set].clean_inprog + nr_writes) < dmc->max_clean_ios_set) &&
		       ((nr_writes + dmc->clean_inprog) < dmc->max_clean_ios_total) &&
		       nr_writes < to_clean) {
			cacheblk = &dmc->cache[lru_rel_index + start_index];
			if ((cacheblk->cache_state & (DIRTY | BLOCK_IO_INPROG)) == DIRTY) {
				cacheblk->cache_state |= DISKWRITEINPROG;
				writes_list[nr_writes].dbn = cacheblk->dbn;
				writes_list[nr_writes].index = cacheblk - &dmc->cache[0];
				nr_writes++;
			}
			lru_rel_index = cacheblk->lru_next;
		}
	}
	if (nr_writes > 0) {
		int i;

		/*
		 * Merge the candidates with any adjacent dirty blocks to form
		 * larger contiguous writes, then drop the lock before issuing
		 * the (blocking) writebacks.
		 */
		flashcache_merge_writes(dmc, writes_list, &nr_writes, set);
		dmc->clean_set_ios += nr_writes;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		for (i = 0 ; i < nr_writes ; i++)
			flashcache_dirty_writeback(dmc, writes_list[i].index);
	} else {
		int do_delayed_clean = 0;

		/* Still over the watermark but nothing selectable now: retry later. */
		if (dmc->cache_sets[set].nr_dirty > dmc->dirty_thresh_set)
			do_delayed_clean = 1;
		spin_unlock_irqrestore(&dmc->cache_spin_lock, flags);
		/*
		 * Stats bookkeeping; NOTE(review): these counters are read and
		 * updated after the lock is released — presumably acceptable
		 * for statistics, confirm against the rest of the driver.
		 */
		if (dmc->cache_sets[set].clean_inprog >= dmc->max_clean_ios_set)
			dmc->set_limit_reached++;
		if (dmc->clean_inprog >= dmc->max_clean_ios_total)
			dmc->total_limit_reached++;
		if (do_delayed_clean)
			schedule_delayed_work(&dmc->delayed_clean, 1*HZ);	/* retry in 1 second */
		dmc->clean_set_fails++;
	}
	kfree(writes_list);
}
先看输入参数,一个是dmc,另一个是set,即集合下标。1028行判断是否快速移除,如果是就不做任何处理。1030行,申请写记录内存,结构struct dbn_index_pair刚刚已经看过了。1031行,这个是用于测试用的,故意制造申请内存失败的情形,检验程序在这种情况下能否正确运行。1037行,申请不到内存就返回。1043行,判断集合脏块是否达到水位线,没有达到水位线就不用频繁去刷了。1049行,计算出当前需要刷的脏块数。1050行,如果当前刷脏块策略为FIFO,则按照FIFO遍历集合,记录脏块信息。1075行,是LRU策略。这两个策略没有优劣之分,只有说在某种应用下哪种策略更适合。那么还有其他可以比较的吗?来看一下两种策略的内存开销吧。FIFO的开销是在每个集合管理结构cache_set中增加一个set_clean_next、set_fifo_next字段。LRU的开销是集合中有lru_head, lru_tail,cache块中还有lru_prev, lru_next。注意这里的lru_prev, lru_next都用16位无符号数来表示,在64位系统中每个字段节省了48个位。带来的负面作用是每个集合中最多可以有2的16次方个cache块。在某些应用中用下标表示会远比指针表示来得优越。在我曾经做过的一个项目中,要求在程序异常时能够立即恢复并且不影响正在使用服务的客户,这个时候就要求程序重新启动的时候要完全恢复到原来运行的状态,那么就要求需要恢复的数据通通不能用到指针,因为程序重启后这些指针都已经无效了,这个时候下标表示就派上用场了。在获取脏块记录之后,在1094行下发脏块。1102行,有两种情况:一是没有脏块,二是下发脏块达到上限,第二种情况下到1113行隔1秒再调度一次刷脏块。讲到这里,似乎已经把整个流程都遍历一遍了。然而我们还是太乐观了,因为还有更重要的好戏等着我们去探索。