看了ceph的关于snap的一些代码(snap.cc、snaprealm.cc、snapserver.cc、snapmapper.cc),作了标注,在此记录。

snap.cc主要实现相关数据结构的encode(序列化)和decode(反序列化):encode将数据结构表示为二进制流,以便通过网络传输或保存在磁盘等存储介质上;decode则从二进制流中还原出数据结构。

以下是snaprealm.h中SnapRealm结构体的部分成员声明和注释(节选,并非完整定义):

1 struct SnapRealm {
 2   // realm state
 3 
 4   sr_t srnode;
 5 
 6   // in-memory state
 7   MDCache *mdcache;
 8   CInode *inode;
 9 
10   bool open;                        // set to true once all past_parents are opened
11   SnapRealm *parent;
12   set<SnapRealm*> open_children;    // active children that are currently open
13   map<inodeno_t,SnapRealm*> open_past_parents;  // these are explicitly pinned.
14 
15   // cache
16   snapid_t cached_seq;           // max seq over self and all past+present parents.
17   snapid_t cached_last_created;  // max last_created over all past+present parents
18   snapid_t cached_last_destroyed;
19   set<snapid_t> cached_snaps;
20   SnapContext cached_snap_context;
21 
22   bufferlist cached_snap_trace;
23 
24   elist<CInode*> inodes_with_caps;             // for efficient realm splits
25   map<client_t, xlist<Capability*>* > client_caps;   // to identify clients who need snap notifications

snaprealm.cc:

1 #include "SnapRealm.h"
  2 #include "MDCache.h"
  3 #include "MDS.h"
  4 
  5 #include "messages/MClientSnap.h"
  6 
  7 
  8 /*
  9  * SnapRealm
 10  */
 11 
 12 #define dout_subsys ceph_subsys_mds
 13 #undef dout_prefix
 14 #define dout_prefix _prefix(_dout, mdcache->mds->get_nodeid(), inode, srnode.seq, this)
 15 static ostream& _prefix(std::ostream *_dout, int whoami, CInode *inode,   // used by the dout_prefix macro above to start every SnapRealm log line
 16             uint64_t seq, SnapRealm *realm) {
 17   return *_dout << " mds." << whoami
 18         << ".cache.snaprealm(" << inode->ino()
 19         << " seq " << seq << " " << realm << ") ";   // realm is streamed as a pointer, not via operator<<(SnapRealm&)
 20 }
 21 // Streams a one-line summary of a realm: ino, seq, lc, cr, optional cps, snaps, optional past_parents, and its address.
 22 ostream& operator<<(ostream& out, const SnapRealm& realm) 
 23 {
 24   out << "snaprealm(" << realm.inode->ino()
 25       << " seq " << realm.srnode.seq
 26       << " lc " << realm.srnode.last_created
 27       << " cr " << realm.srnode.created;
 28   if (realm.srnode.created != realm.srnode.current_parent_since)   // "cps" is printed only when it differs from created
 29     out << " cps " << realm.srnode.current_parent_since;
 30   out << " snaps=" << realm.srnode.snaps;
 31   if (realm.srnode.past_parents.size()) {
 32     out << " past_parents=(";
 33     for (map<snapid_t, snaplink_t>::const_iterator p = realm.srnode.past_parents.begin(); 
 34      p != realm.srnode.past_parents.end(); 
 35      ++p) {
 36       if (p != realm.srnode.past_parents.begin()) out << ",";   // comma-separate every entry after the first
 37       out << p->second.first << "-" << p->first   // each entry prints as <link.first>-<map key>=<link.ino>
 38       << "=" << p->second.ino;
 39     }
 40     out << ")";
 41   }
 42   out << " " << &realm << ")";
 43   return out;
 44 }
 45 
 46 // Register `parent` in the open_past_parents map, keyed by its inode number, and pin its inode.
 47 void SnapRealm::add_open_past_parent(SnapRealm *parent)
 48 {
 49   open_past_parents[parent->inode->ino()] = parent;
 50   parent->inode->get(CInode::PIN_PASTSNAPPARENT);        //take a CInode::PIN_PASTSNAPPARENT reference; close_parents() puts it back
 51 }
 52 // Completion context handed to open_ino() by _open_parents(); when it fires it retries _open_parents().
 53 struct C_SR_RetryOpenParents : public MDSInternalContextBase {
 54   SnapRealm *sr;                             // realm whose parents are being opened
 55   snapid_t first, last, parent_last;         // [first,last] to retry with; parent_last is the past_parents key being opened
 56   inodeno_t parent;                          // inode number passed to open_ino()
 57   MDSInternalContextBase* fin;               // caller's context, completed once _open_parents() returns true
 58   C_SR_RetryOpenParents(SnapRealm *s, snapid_t f, snapid_t l, snapid_t pl,
 59             inodeno_t p, MDSInternalContextBase *c) :
 60     sr(s), first(f), last(l), parent_last(pl),  parent(p), fin(c) {}
 61   MDS *get_mds() { return sr->mdcache->mds; }
 62   void finish(int r) {
 63     if (r < 0)   // the open failed: drop the past_parents entry keyed by parent_last
 64       sr->_remove_missing_parent(parent_last, parent, r);
 65     if (sr->_open_parents(fin, first, last))   // retry; a false return means the retry did not finish here
 66       fin->complete(0);
 67   }
 68 };
 69 
 70 // Erase the past_parents entry keyed by `snapid`, if present; `parent` and `err` are only used in the log output.
 71 void SnapRealm::_remove_missing_parent(snapid_t snapid, inodeno_t parent, int err)
 72 {
 73   map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.find(snapid);
 74   if (p != srnode.past_parents.end()) {
 75     dout(10) << __func__ << " " << parent << " [" << p->second.first << ","
 76          << p->first << "]  errno " << err << dendl;
 77     srnode.past_parents.erase(p);
 78   } else {
 79     dout(10) << __func__ << " " << parent << " not found" << dendl;   // nothing to erase, only logged
 80   }
 81 }
 82 
 83 // Try to open the current and past parent realms. Returns true (and sets `open`) when all are open; returns false if an open_ino() was started or a recursive call returned false.
 84 bool SnapRealm::_open_parents(MDSInternalContextBase *finish, snapid_t first, snapid_t last)
 85 {
 86   dout(10) << "open_parents [" << first << "," << last << "]" << dendl;
 87   if (open) 
 88     return true;
 89 
 90   // make sure my current parents' parents are open...
 91   if (parent) {
 92     dout(10) << " current parent [" << srnode.current_parent_since << ",head] is " << *parent
 93          << " on " << *parent->inode << dendl;
 94     if (last >= srnode.current_parent_since &&   // recurse only when [first,last] reaches into the current parent's interval
 95     !parent->_open_parents(finish, MAX(first, srnode.current_parent_since), last))
 96       return false;
 97   }
 98 
 99   // and my past parents too!
100   assert(srnode.past_parents.size() >= open_past_parents.size());
101   if (srnode.past_parents.size() > open_past_parents.size()) {   // fewer opened than recorded: some past parent still needs opening
102     for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.begin();
103      p != srnode.past_parents.end();
104      ++p) {    
105       dout(10) << " past_parent [" << p->second.first << "," << p->first << "] is "
106            << p->second.ino << dendl;
107       CInode *parent = mdcache->get_inode(p->second.ino);   // local `parent` shadows the member of the same name
108       if (!parent) {
109     C_SR_RetryOpenParents *fin = new C_SR_RetryOpenParents(this, first, last, p->first,   // not in cache: start open_ino() and retry from the context's finish()
110                                    p->second.ino, finish);
111     mdcache->open_ino(p->second.ino, mdcache->mds->mdsmap->get_metadata_pool(), fin);
112     return false;
113       }
114       assert(parent->snaprealm);  // hmm!
115       if (!parent->snaprealm->_open_parents(finish, p->second.first, p->first))   // the past parent must open its own parents for the interval of this link
116     return false;
117       if (!open_past_parents.count(p->second.ino)) {   // register (and pin) each past parent only once
118     add_open_past_parent(parent->snaprealm);
119       }
120     }
121   }
122 
123   open = true;
124   return true;
125 }
126 
127 // Check whether the past parents covering [first,last] are already in open_past_parents (recursively); sets `open` and returns true if so.
128 bool SnapRealm::have_past_parents_open(snapid_t first, snapid_t last)    
129 {
130   dout(10) << "have_past_parents_open [" << first << "," << last << "]" << dendl;
131   if (open)
132     return true;
133 
134   for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first);   // start at the first entry whose key is >= first
135        p != srnode.past_parents.end();
136        ++p) {
137     if (p->second.first > last)   // this entry starts after `last`: stop scanning
138       break;
139     dout(10) << " past parent [" << p->second.first << "," << p->first << "] was "
140          << p->second.ino << dendl;
141     if (open_past_parents.count(p->second.ino) == 0) {
142       dout(10) << " past parent " << p->second.ino << " is not open" << dendl;
143       return false;
144     }
145     if (!open_past_parents[p->second.ino]->have_past_parents_open(MAX(first, p->second.first),   // recurse with the range clipped to this link's interval
146                                   MIN(last, p->first)))
147       return false;
148   }
149 
150   open = true;
151   return true;
152 }
153 
154 // Drop the PIN_PASTSNAPPARENT pin on every realm in open_past_parents, then empty the map.
155 void SnapRealm::close_parents()
156 {
157   for (map<inodeno_t,SnapRealm*>::iterator p = open_past_parents.begin();
158        p != open_past_parents.end();
159        ++p)
160     p->second->inode->put(CInode::PIN_PASTSNAPPARENT);   //release the CInode::PIN_PASTSNAPPARENT reference taken in add_open_past_parent()
161   open_past_parents.clear();  //forget all open past parents
162 }
163 
164 
165 /*
166  * get list of snaps for this realm.  we must include parents' snaps
167  * for the intervals during which they were our parent.
168  */
169  // Inserts snap ids into `s` and raises max_seq / max_last_created / max_last_destroyed to this realm's values when those are larger.
170 void SnapRealm::build_snap_set(set<snapid_t> &s,
171                    snapid_t& max_seq, snapid_t& max_last_created, snapid_t& max_last_destroyed,
172                    snapid_t first, snapid_t last)
173 {
174   dout(10) << "build_snap_set [" << first << "," << last << "] on " << *this << dendl;
175 
176   if (srnode.seq > max_seq)
177     max_seq = srnode.seq;
178   if (srnode.last_created > max_last_created)
179     max_last_created = srnode.last_created;
180   if (srnode.last_destroyed > max_last_destroyed)
181     max_last_destroyed = srnode.last_destroyed;
182 
183   // include my snaps within interval [first,last]
184   for (map<snapid_t, SnapInfo>::iterator p = srnode.snaps.lower_bound(first); // first element >= first
185        p != srnode.snaps.end() && p->first <= last;
186        ++p)
187     s.insert(p->first);
188 
189   // include snaps for parents during intervals that intersect [first,last]
190   for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first);
191        p != srnode.past_parents.end() && p->first >= first && p->second.first <= last;
192        ++p) {
193     CInode *oldparent = mdcache->get_inode(p->second.ino);
194     assert(oldparent);  // call open_parents first!
195     assert(oldparent->snaprealm);
196     oldparent->snaprealm->build_snap_set(s, max_seq, max_last_created, max_last_destroyed,
197                      MAX(first, p->second.first),
198                      MIN(last, p->first));           //recurse: add the past parent's snap ids (range clipped to the link interval) to the set
199   }
200   if (srnode.current_parent_since <= last && parent)
201     parent->build_snap_set(s, max_seq, max_last_created, max_last_destroyed,
202                MAX(first, srnode.current_parent_since), last);         //recurse: add the current parent's snap ids to the set
203 }
204 
205 
206 // Rebuild cached_snaps, the cached maxima and cached_snap_trace when cached_seq is behind srnode.seq; otherwise do nothing.
207 void SnapRealm::check_cache()
208 {
209   if (cached_seq >= srnode.seq)   // cache is at least as new as the realm: keep it
210     return;
211 
212   cached_snaps.clear();
213   cached_snap_context.clear();   // not refilled here; get_snap_context() rebuilds it on demand
214 
215   cached_last_created = srnode.last_created;
216   cached_last_destroyed = srnode.last_destroyed;
217   cached_seq = srnode.seq;
218   build_snap_set(cached_snaps, cached_seq, cached_last_created, cached_last_destroyed,   // may raise the three cached_* maxima further
219          0, CEPH_NOSNAP);
220 
221   cached_snap_trace.clear();
222   build_snap_trace(cached_snap_trace);
223   
224   dout(10) << "check_cache rebuilt " << cached_snaps
225        << " seq " << srnode.seq
226        << " cached_seq " << cached_seq
227        << " cached_last_created " << cached_last_created
228        << " cached_last_destroyed " << cached_last_destroyed
229        << ")" << dendl;
230 }
231 
232 // Return the cached snap id set after refreshing it through check_cache().
233 const set<snapid_t>& SnapRealm::get_snaps()
234 {
235   check_cache();
236   dout(10) << "get_snaps " << cached_snaps
237        << " (seq " << srnode.seq << " cached_seq " << cached_seq << ")"
238        << dendl;
239   return cached_snaps;
240 }
241 
242 /*
243  * build vector in reverse sorted order
244  */
245  // Returns cached_snap_context; while its seq is 0 it is first filled from cached_seq and cached_snaps.
246 const SnapContext& SnapRealm::get_snap_context()
247 {
248   check_cache();
249 
250   if (!cached_snap_context.seq) {   // seq == 0 is treated as "not built yet"
251     cached_snap_context.seq = cached_seq;
252     cached_snap_context.snaps.resize(cached_snaps.size());
253     unsigned i = 0;
254     for (set<snapid_t>::reverse_iterator p = cached_snaps.rbegin();   // reverse iteration over the set gives descending snap ids
255      p != cached_snaps.rend();
256      ++p)
257       cached_snap_context.snaps[i++] = *p;
258   }
259 
260   return cached_snap_context;
261 }
262 
263 // Fill `infomap` with pointers to the SnapInfo of this realm's snaps in [first,last], then recurse into the past and current parents.
264 void SnapRealm::get_snap_info(map<snapid_t,SnapInfo*>& infomap, snapid_t first, snapid_t last)
265 {
266   const set<snapid_t>& snaps = get_snaps();   // result is used only for the log line below (get_snaps() also refreshes the cache)
267   dout(10) << "get_snap_info snaps " << snaps << dendl;
268 
269   // include my snaps within interval [first,last]
270   for (map<snapid_t, SnapInfo>::iterator p = srnode.snaps.lower_bound(first); // first element >= first
271        p != srnode.snaps.end() && p->first <= last;
272        ++p)
273     infomap[p->first] = &p->second;    //map the snap id to a pointer to its SnapInfo inside srnode.snaps
274 
275   // include snaps for parents during intervals that intersect [first,last]
276   for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first);
277        p != srnode.past_parents.end() && p->first >= first && p->second.first <= last;
278        ++p) {
279     CInode *oldparent = mdcache->get_inode(p->second.ino);
280     assert(oldparent);  // call open_parents first!
281     assert(oldparent->snaprealm);
282     oldparent->snaprealm->get_snap_info(infomap,
283                     MAX(first, p->second.first),
284                     MIN(last, p->first));
285   }
286   if (srnode.current_parent_since <= last && parent)
287     parent->get_snap_info(infomap, MAX(first, srnode.current_parent_since), last);
288 }
289 // Return the name of `snapid`: from this realm's own snaps if present, else from the covering past parent, else from the current parent.
290 const string& SnapRealm::get_snapname(snapid_t snapid, inodeno_t atino)
291 {
292   if (srnode.snaps.count(snapid)) {
293     if (atino == inode->ino())   // plain name when asked at this realm's own inode, long name otherwise
294       return srnode.snaps[snapid].name;
295     else
296       return srnode.snaps[snapid].get_long_name();
297   }
298 
299   map<snapid_t,snaplink_t>::iterator p = srnode.past_parents.lower_bound(snapid);   // first past-parent entry whose key is >= snapid
300   if (p != srnode.past_parents.end() && p->second.first <= snapid) {   // snapid lies inside that link's [first,key] interval
301     CInode *oldparent = mdcache->get_inode(p->second.ino);
302     assert(oldparent);  // call open_parents first!
303     assert(oldparent->snaprealm);    
304     return oldparent->snaprealm->get_snapname(snapid, atino);
305   }
306 
307   assert(srnode.current_parent_since <= snapid);   // otherwise the snap must belong to the current parent
308   assert(parent);
309   return parent->get_snapname(snapid, atino);
310 }
311 
312 // Resolve snapshot name `n` to a snap id within [first,last], searching this realm, then past parents, then the current parent; returns 0 if nothing matches.
313 snapid_t SnapRealm::resolve_snapname(const string& n, inodeno_t atino, snapid_t first, snapid_t last)
314 {
315   // first try me
316   dout(10) << "resolve_snapname '" << n << "' in [" << first << "," << last << "]" << dendl;
317 
318   //snapid_t num;
319   //if (n[0] == '~') num = atoll(n.c_str()+1);
320 
321   bool actual = (atino == inode->ino());   // true when the lookup is made at this realm's own inode
322   string pname;
323   inodeno_t pino;
324   if (!actual) {
325     if (!n.length() ||   // looked up from another inode: the name must have the form _<name>_<ino>
326     n[0] != '_') return 0;
327     int next_ = n.find('_', 1);   // NOTE(review): find() result is narrowed to int; the check below relies on npos becoming negative
328     if (next_ < 0) return 0;
329     pname = n.substr(1, next_ - 1);
330     pino = atoll(n.c_str() + next_ + 1);
331     dout(10) << " " << n << " parses to name '" << pname << "' dirino " << pino << dendl;
332   }
333 
334 // scan this realm's own snaps in [first,last] for a matching SnapInfo
335   for (map<snapid_t, SnapInfo>::iterator p = srnode.snaps.lower_bound(first); // first element >= first
336        p != srnode.snaps.end() && p->first <= last;
337        ++p) {
338     dout(15) << " ? " << p->second << dendl;
339     //if (num && p->second.snapid == num)
340     //return p->first;
341     if (actual && p->second.name == n)
342     return p->first;
343     if (!actual && p->second.name == pname && p->second.ino == pino)   // parsed form must match both the name and the inode number
344       return p->first;
345   }
346 
347     // include snaps for parents during intervals that intersect [first,last]
348   for (map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.lower_bound(first);
349        p != srnode.past_parents.end() && p->first >= first && p->second.first <= last;
350        ++p) {
351     CInode *oldparent = mdcache->get_inode(p->second.ino);
352     assert(oldparent);  // call open_parents first!
353     assert(oldparent->snaprealm);
354     snapid_t r = oldparent->snaprealm->resolve_snapname(n, atino,
355                             MAX(first, p->second.first),
356                             MIN(last, p->first));
357     if (r)   // a non-zero id means the past parent resolved the name
358       return r;
359   }
360   if (parent && srnode.current_parent_since <= last)
361     return parent->resolve_snapname(n, atino, MAX(first, srnode.current_parent_since), last);
362   return 0;
363 }
364 
365 
366 // Recompute `parent` from the inode's parent dentry; on change, move this realm between the parents' open_children sets and invalidate the cached snaps.
367 void SnapRealm::adjust_parent()
368 {
369   SnapRealm *newparent = inode->get_parent_dn()->get_dir()->get_inode()->find_snaprealm();   // realm found via the inode of the containing directory
370   if (newparent != parent) {
371     dout(10) << "adjust_parent " << parent << " -> " << newparent << dendl;
372     if (parent)
373       parent->open_children.erase(this);
374     parent = newparent;
375     if (parent)
376       parent->open_children.insert(this);
377     
378     invalidate_cached_snaps();
379   }
380 }
381 
382 // Split `child` off this realm: hand over the open child realms and the inodes with caps that fall under child's inode.
383 void SnapRealm::split_at(SnapRealm *child)
384 {
385   dout(10) << "split_at " << *child 
386        << " on " << *child->inode << dendl;
387 
388   if (inode->is_mdsdir() || !child->inode->is_dir()) {
389     // it's not a dir.
390     if (child->inode->containing_realm) {
391       //  - no open children.
392       //  - only need to move this child's inode's caps.
393       child->inode->move_to_realm(child);           //leave containing_realm and join the child realm
394     } else {
395       // no caps, nothing to move/split.
396       dout(20) << " split no-op, no caps to move on file " << *child->inode << dendl;
397       assert(!child->inode->is_any_caps());
398     }
399     return;
400   }
401 
402   // it's a dir.
403 
404   // split open_children
405   dout(10) << " open_children are " << open_children << dendl;
406   for (set<SnapRealm*>::iterator p = open_children.begin();
407        p != open_children.end(); ) {
408     SnapRealm *realm = *p;
409     if (realm != child &&
410     child->inode->is_projected_ancestor_of(realm->inode)) {
411       dout(20) << " child gets child realm " << *realm << " on " << *realm->inode << dendl;
412       realm->parent = child;
413       child->open_children.insert(realm);
414       open_children.erase(p++);   // post-increment so the iterator is advanced before its element is erased
415     } else {
416       dout(20) << "    keeping child realm " << *realm << " on " << *realm->inode << dendl;
417       ++p;
418     }
419   }
420 
421   // split inodes_with_caps
422   elist<CInode*>::iterator p = inodes_with_caps.begin(member_offset(CInode, item_caps));
423   while (!p.end()) {
424     CInode *in = *p;
425     ++p;   // advance first; presumably because move_to_realm() below may take `in` off this list
426 
427     // does inode fall within the child realm?
428     bool under_child = false;
429 
430     if (in == child->inode) {
431       under_child = true;
432     } else {
433       CInode *t = in;
434       while (t->get_parent_dn()) {   // walk up through parent dentries looking for child's inode
435     t = t->get_parent_dn()->get_dir()->get_inode();
436     if (t == child->inode) {
437       under_child = true;
438       break;
439     }
440     if (t == in)   // walked back to the starting inode: stop
441       break;
442       }
443     }
444     if (under_child) {
445       dout(20) << " child gets " << *in << dendl;
446       in->move_to_realm(child);         //leave containing_realm and join the child realm
447     } else {
448       dout(20) << "    keeping " << *in << dendl;
449     }
450   }
451 
452 }
453 // Return the cached encoded snap trace after refreshing it through check_cache().
454 const bufferlist& SnapRealm::get_snap_trace()
455 {
456   check_cache();
457   return cached_snap_trace;
458 }
459 
460 void SnapRealm::build_snap_trace(bufferlist& snapbl)    //append the encoded SnapRealmInfo of this realm, then recursively of each parent, to snapbl
461 {
462   SnapRealmInfo info(inode->ino(), srnode.created, srnode.seq, srnode.current_parent_since);
463 
464   if (parent) {
465     info.h.parent = parent->inode->ino();
466     if (!srnode.past_parents.empty()) {
467       snapid_t last = srnode.past_parents.rbegin()->first;   // largest past_parents key
468       set<snapid_t> past;
469       snapid_t max_seq, max_last_created, max_last_destroyed;   // required by build_snap_set(); their values are not read afterwards
470       build_snap_set(past, max_seq, max_last_created, max_last_destroyed, 0, last);
471       info.prior_parent_snaps.reserve(past.size());
472       for (set<snapid_t>::reverse_iterator p = past.rbegin(); p != past.rend(); ++p)   // stored in descending order
473     info.prior_parent_snaps.push_back(*p);
474       dout(10) << "build_snap_trace prior_parent_snaps from [1," << last << "] "   // NOTE(review): log text says 1 but the call above passes 0 as the lower bound
475            << info.prior_parent_snaps << dendl;
476     }
477   } else 
478     info.h.parent = 0;
479 
480   info.my_snaps.reserve(srnode.snaps.size());
481   for (map<snapid_t,SnapInfo>::reverse_iterator p = srnode.snaps.rbegin();   // this realm's own snap ids, descending
482        p != srnode.snaps.rend();
483        ++p)
484     info.my_snaps.push_back(p->first);
485   dout(10) << "build_snap_trace my_snaps " << info.my_snaps << dendl;
486 
487   ::encode(info, snapbl);
488 
489   if (parent)
490     parent->build_snap_trace(snapbl);
491 }
492 
493 
494 // Erase past_parents entries whose [link.first, key] interval contains no snap id from cached_snaps.
495 void SnapRealm::prune_past_parents()
496 {
497   dout(10) << "prune_past_parents" << dendl;
498   check_cache();
499   assert(open);
500 
501   map<snapid_t, snaplink_t>::iterator p = srnode.past_parents.begin();
502   while (p != srnode.past_parents.end()) {
503     set<snapid_t>::iterator q = cached_snaps.lower_bound(p->second.first);   // first cached snap id >= start of the interval
504     if (q == cached_snaps.end() ||   //no cached snap at or after the interval start, or the first one lies past its end: erase
505     *q > p->first) {
506       dout(10) << "prune_past_parents pruning [" << p->second.first << "," << p->first 
507            << "] " << p->second.ino << dendl;
508       srnode.past_parents.erase(p++);   // post-increment so the iterator is advanced before its element is erased
509     } else {
510       dout(10) << "prune_past_parents keeping [" << p->second.first << "," << p->first 
511            << "] " << p->second.ino << dendl;
512       ++p;
513     }
514   }
515 }

snapserver.cc:

1 #include "SnapServer.h"
  2 #include "MDS.h"
  3 #include "osd/OSDMap.h"
  4 #include "osdc/Objecter.h"
  5 #include "mon/MonClient.h"
  6 
  7 #include "include/types.h"
  8 #include "messages/MMDSTableRequest.h"
  9 #include "messages/MRemoveSnaps.h"
 10 
 11 #include "msg/Messenger.h"
 12 
 13 #include "common/config.h"
 14 #include "include/assert.h"
 15 
 16 #define dout_subsys ceph_subsys_mds
 17 #undef dout_prefix
 18 #define dout_prefix *_dout << "mds." << rank << ".snap "
 19 
 20 
 21 void SnapServer::reset_state()  //reset: clear snaps and need_to_purge, then raise last_snap to the highest removed_snaps range end found in the data pools
 22 {
 23   last_snap = 1;  /* snapid 1 reserved for initial root snaprealm */
 24   snaps.clear();
 25   need_to_purge.clear();
 26 
 27   // find any removed snapshot in data pools
 28   snapid_t first_free = 0;
 29   const OSDMap *osdmap = mds->objecter->get_osdmap_read();
 30   for (set<int64_t>::const_iterator p = mds->mdsmap->get_data_pools().begin();
 31        p != mds->mdsmap->get_data_pools().end();
 32        ++p) {
 33     const pg_pool_t *pi = osdmap->get_pg_pool(*p);   // NOTE(review): pi is dereferenced below without a null check; confirm a data pool can never be missing from the OSD map
 34     if (!pi->removed_snaps.empty() &&
 35         pi->removed_snaps.range_end() > first_free)
 36       first_free = pi->removed_snaps.range_end();
 37   }
 38   mds->objecter->put_osdmap_read();   // pairs with get_osdmap_read() above
 39   if (first_free > last_snap)
 40     last_snap = first_free;        //raise last_snap to first_free
 41 }
 42 
 43 
 44 // SERVER
 45 // Decode an op from bl and stage it under a new version: CREATE -> pending_create (or pending_noop), DESTROY -> pending_destroy. bl is then overwritten with the encoded last_snap.
 46 void SnapServer::_prepare(bufferlist &bl, uint64_t reqid, mds_rank_t bymds)
 47 {
 48   bufferlist::iterator p = bl.begin();
 49   __u32 op;
 50   ::decode(op, p);
 51 
 52   switch (op) {
 53   case TABLE_OP_CREATE:
 54     {
 55       version++;
 56 
 57       SnapInfo info;
 58       ::decode(info.ino, p);
 59       if (!p.end()) {   // more payload follows the ino: a real create carrying name and stamp
 60     ::decode(info.name, p);
 61     ::decode(info.stamp, p);
 62     info.snapid = ++last_snap;   // allocate the next snap id
 63     pending_create[version] = info;
 64     dout(10) << "prepare v" << version << " create " << info << dendl;
 65       } else {   // only an ino was sent: recorded as a no-op, last_snap is not bumped
 66     pending_noop.insert(version);
 67     dout(10) << "prepare v" << version << " noop" << dendl;
 68       }
 69       bl.clear();
 70       ::encode(last_snap, bl);
 71     }
 72     break;
 73 
 74   case TABLE_OP_DESTROY:
 75     {
 76       inodeno_t ino;
 77       snapid_t snapid;
 78       ::decode(ino, p);    // not used, currently.
 79       ::decode(snapid, p);
 80       version++;
 81 
 82       // bump last_snap... we use it as a version value on the snaprealm.
 83       ++last_snap;
 84 
 85       pending_destroy[version] = pair<snapid_t,snapid_t>(snapid, last_snap);   // (snap to remove, seq)
 86       dout(10) << "prepare v" << version << " destroy " << snapid << " seq " << last_snap << dendl;
 87 
 88       bl.clear();
 89       ::encode(last_snap, bl);
 90     }
 91     break;
 92 
 93   default:
 94     assert(0);   // any other op is rejected
 95   }
 96   //dump();
 97 }
 98 
 99 // True if tid has an entry in pending_create or pending_destroy (a presence test, not a count); pending_noop is not consulted.
100 bool SnapServer::_is_prepared(version_t tid)
101 {
102   return 
103     pending_create.count(tid) ||
104     pending_destroy.count(tid);
105 }
106 
107 // Commit the staged entry for tid: create -> snaps, destroy -> need_to_purge, noop -> dropped; asserts on an unknown tid. `req` is not used in this body; always returns true.
108 bool SnapServer::_commit(version_t tid, MMDSTableRequest *req)
109 {
110   if (pending_create.count(tid)) {    //tid is a pending create: copy its SnapInfo into snaps under its snapid, then drop the pending entry
111     dout(7) << "commit " << tid << " create " << pending_create[tid] << dendl;
112     snaps[pending_create[tid].snapid] = pending_create[tid];
113     pending_create.erase(tid);
114   } 
115 
116   else if (pending_destroy.count(tid)) {      //tid is a pending destroy: erase the snap and queue sn and seq for purge in every data pool
117     snapid_t sn = pending_destroy[tid].first;    //removed_snap
118     snapid_t seq = pending_destroy[tid].second;   //seq
119     dout(7) << "commit " << tid << " destroy " << sn << " seq " << seq << dendl;
120     snaps.erase(sn);
121 
122     for (set<int64_t>::const_iterator p = mds->mdsmap->get_data_pools().begin();
123      p != mds->mdsmap->get_data_pools().end();
124      ++p) {
125       need_to_purge[*p].insert(sn);
126       need_to_purge[*p].insert(seq);
127     }
128 
129     pending_destroy.erase(tid);
130   }
131   else if (pending_noop.count(tid)) {     //tid is a pending noop: just drop its entry
132     dout(7) << "commit " << tid << " noop" << dendl;
133     pending_noop.erase(tid);
134   }
135   else
136     assert(0);
137 
138   // bump version.
139   version++;
140   //dump();
141   return true;
142 }
143 
144 // Roll back tid: log it and erase its entry from whichever pending_* container holds it (nothing else is undone here); asserts on an unknown tid.
145 void SnapServer::_rollback(version_t tid) 
146 {
147   if (pending_create.count(tid)) {
148     dout(7) << "rollback " << tid << " create " << pending_create[tid] << dendl;
149     pending_create.erase(tid);
150   } 
151 
152   else if (pending_destroy.count(tid)) {
153     dout(7) << "rollback " << tid << " destroy " << pending_destroy[tid] << dendl;
154     pending_destroy.erase(tid);
155   }
156   
157   else if (pending_noop.count(tid)) {
158     dout(7) << "rollback " << tid << " noop" << dendl;
159     pending_noop.erase(tid);
160   }    
161 
162   else
163     assert(0);
164 
165   // bump version.
166   version++;
167   //dump();
168 }
169 
170 // Decode a pool -> snapids map from bl and erase those snapids from need_to_purge, dropping pools whose set becomes empty; bumps version.
171 void SnapServer::_server_update(bufferlist& bl)
172 {
173   bufferlist::iterator p = bl.begin();
174   map<int, vector<snapid_t> > purge;
175   ::decode(purge, p);
176 
177   dout(7) << "_server_update purged " << purge << dendl;
178   for (map<int, vector<snapid_t> >::iterator p = purge.begin();   // this `p` shadows the bufferlist iterator declared above
179        p != purge.end();
180        ++p) {
181     for (vector<snapid_t>::iterator q = p->second.begin();
182      q != p->second.end();
183      ++q)
184       need_to_purge[p->first].erase(*q);
185     if (need_to_purge[p->first].empty())   // nothing left to purge for this pool: remove its entry
186       need_to_purge.erase(p->first);
187   }
188 
189   version++;
190 }
191 // No query handling is implemented here: the body only calls req->put().
192 void SnapServer::handle_query(MMDSTableRequest *req)
193 {
194   req->put();
195 }
196 
197 
198 // Compare need_to_purge with the OSD map: snaps already marked removed go to do_server_update(), the rest are sent to the monitor in an MRemoveSnaps.
199 void SnapServer::check_osd_map(bool force)
200 {
201   if (!force && version == last_checked_osdmap) {   // unless forced, skip when version equals the value stored at the last check
202     dout(10) << "check_osd_map - version unchanged" << dendl;
203     return;
204   }
205   dout(10) << "check_osd_map need_to_purge=" << need_to_purge << dendl;
206 
207   map<int, vector<snapid_t> > all_purge;
208   map<int, vector<snapid_t> > all_purged;
209 
210   const OSDMap *osdmap = mds->objecter->get_osdmap_read();
211   for (map<int, set<snapid_t> >::iterator p = need_to_purge.begin();       //walk need_to_purge pool by pool
212        p != need_to_purge.end();
213        ++p) {
214     int id = p->first;
215     const pg_pool_t *pi = osdmap->get_pg_pool(id);   // NOTE(review): pi is dereferenced below without a null check; confirm the pool cannot have been deleted
216     for (set<snapid_t>::iterator q = p->second.begin();
217      q != p->second.end();
218      ++q) {
219       if (pi->is_removed_snap(*q)) {     //already removed -> all_purged; not yet removed -> all_purge
220     dout(10) << " osdmap marks " << *q << " as removed" << dendl;
221     all_purged[id].push_back(*q);
222       } else {
223     all_purge[id].push_back(*q);
224       }
225     }
226   }
227   mds->objecter->put_osdmap_read();
228 
229   if (!all_purged.empty()) {  //some snaps are already purged: encode them and pass them to do_server_update()
230     // prepare to remove from need_to_purge list
231     bufferlist bl;
232     ::encode(all_purged, bl);
233     do_server_update(bl);
234   }
235 
236   if (!all_purge.empty()) {        //some snaps still need purging: send an MRemoveSnaps carrying all_purge to the monitor
237     dout(10) << "requesting removal of " << all_purge << dendl;
238     MRemoveSnaps *m = new MRemoveSnaps(all_purge);
239     mds->monc->send_mon_message(m);
240   }
241 
242   last_checked_osdmap = version;
243 }
244 
245 
246 // Dump the table state (last_snap, pending_noop, snaps, need_to_purge, pending_create, pending_destroy) through the Formatter.
247 void SnapServer::dump(Formatter *f) const
248 {
249   f->open_object_section("snapserver");
250 
251   f->dump_int("last_snap", last_snap.val);
252 
253   f->open_array_section("pending_noop");
254   for(set<version_t>::const_iterator i = pending_noop.begin(); i != pending_noop.end(); ++i) {
255     f->dump_unsigned("version", *i);
256   }
257   f->close_section();
258 
259   f->open_array_section("snaps");
260   for (map<snapid_t, SnapInfo>::const_iterator i = snaps.begin(); i != snaps.end(); ++i) {
261     f->open_object_section("snap");
262     i->second.dump(f);
263     f->close_section();
264   }
265   f->close_section();
266 
267   f->open_object_section("need_to_purge");   // object keyed by pool id, each value an array of snapids
268   for (map<int, set<snapid_t> >::const_iterator i = need_to_purge.begin(); i != need_to_purge.end(); ++i) {
269     stringstream pool_id;   // the pool id is rendered as text to serve as the section name
270     pool_id << i->first;
271     f->open_array_section(pool_id.str().c_str());
272     for (set<snapid_t>::const_iterator s = i->second.begin(); s != i->second.end(); ++s) {
273       f->dump_unsigned("snapid", s->val);
274     }
275     f->close_section();
276   }
277   f->close_section();
278 
279   f->open_array_section("pending_create");
280   for(map<version_t, SnapInfo>::const_iterator i = pending_create.begin(); i != pending_create.end(); ++i) {
281     f->open_object_section("snap");
282     f->dump_unsigned("version", i->first);
283     f->open_object_section("snapinfo");
284     i->second.dump(f);
285     f->close_section();
286     f->close_section();
287   }
288   f->close_section();
289 
290   f->open_array_section("pending_destroy");
291   for(map<version_t, pair<snapid_t, snapid_t> >::const_iterator i = pending_destroy.begin(); i != pending_destroy.end(); ++i) {
292     f->open_object_section("snap");
293     f->dump_unsigned("version", i->first);
294     f->dump_unsigned("removed_snap", i->second.first);
295     f->dump_unsigned("seq", i->second.second);
296     f->close_section();
297   }
298   f->close_section();
299 
300   f->close_section();   // closes the outer "snapserver" section
301 }
302 // Append two heap-allocated instances to ls: a default-constructed SnapServer and one with every container populated.
303 void SnapServer::generate_test_instances(list<SnapServer*>& ls)
304 {
305   list<SnapInfo*> snapinfo_instances;
306   SnapInfo::generate_test_instances(snapinfo_instances);
307   SnapInfo populated_snapinfo = *(snapinfo_instances.back());   // keep a copy of the last generated SnapInfo
308   for (list<SnapInfo*>::iterator i = snapinfo_instances.begin(); i != snapinfo_instances.end(); ++i) {
309     delete *i;   // the generated SnapInfo objects are no longer needed once copied
310   }
311 
312   SnapServer *blank = new SnapServer();
313   ls.push_back(blank);
314   SnapServer *populated = new SnapServer();
315   populated->last_snap = 123;
316   populated->snaps[456] = populated_snapinfo;
317   populated->need_to_purge[2].insert(012);   // NOTE(review): 012 is an octal literal (decimal 10); confirm 12 was not intended
318   populated->pending_create[234] = populated_snapinfo;
319   populated->pending_destroy[345].first = 567;
320   populated->pending_destroy[345].second = 768;
321   populated->pending_noop.insert(890);
322 
323   ls.push_back(populated);
324 
325 }

snapmapper.h中有一段注释如下,

/**
 * SnapMapper
 *
 * Manages two mappings:
 *  1) hobject_t -> {snapid}    //为每个克隆对象记录它所属的快照集合
 *  2) snapid -> {hobject_t}    //为每个快照记录属于该快照的对象集合
 *
 * We accomplish this using two sets of keys:
 *  1) OBJECT_PREFIX + obj.str() -> encoding of object_snaps
 *  2) MAPPING_PREFIX + snapid_t + obj.str() -> encoding of pair<snapid_t, obj>
 *
 * The on disk strings and encodings are implemented in to_raw, to_raw_key,
 * from_raw, to_object_key.
 *
 * The object -> {snapid} mapping is primarily included so that the
 * SnapMapper state can be verified against the external PG state during
 * scrub etc.
 *
 * The 2) mapping is arranged such that all objects in a particular
 * snap will sort together, and so that all objects in a pg for a
 * particular snap will group under up to 8 prefixes.    //即某个pg中属于同一快照的所有对象最多分布在8个前缀之下,查找时最多只需检查这8个前缀
 */

snapmapper.cc:

1 #include "SnapMapper.h"
  2 
  3 #define dout_subsys ceph_subsys_osd
  4 #undef dout_prefix
  5 #define dout_prefix *_dout << "snap_mapper."
  6 
  7 using std::string;
  8 
  // Key prefixes: MAPPING_PREFIX starts the keys built by get_prefix()/to_raw_key(),
  // OBJECT_PREFIX starts the keys built by to_object_key().
  9 const string SnapMapper::MAPPING_PREFIX = "MAP_";
 10 const string SnapMapper::OBJECT_PREFIX = "OBJ_";
 11 
 // Fetches the values stored under `keys` into *out by forwarding to
 // os->omap_get_values() for (cid, hoid); returns that call's result unchanged.
 12 int OSDriver::get_keys(
 13   const std::set<std::string> &keys,
 14   std::map<std::string, bufferlist> *out)
 15 {
 16   return os->omap_get_values(cid, hoid, keys, out);
 17 }
 18 
 // Positions an omap iterator for (cid, hoid) with upper_bound(key) and, if it
 // lands on a valid entry, copies that entry's key/value into *next.
 // `next` may be NULL, in which case only the return code is produced.
 // Returns 0 when an entry was found, -ENOENT when the iterator is not valid,
 // and -EINVAL (after assert(0)) when no iterator could be obtained.
 19 int OSDriver::get_next(
 20   const std::string &key,
 21   pair<std::string, bufferlist> *next)
 22 {
 23   ObjectMap::ObjectMapIterator iter =
 24     os->get_omap_iterator(cid, hoid);
 25   if (!iter) {
 26     assert(0);
 27     return -EINVAL;
 28   }
 29   iter->upper_bound(key);  // NOTE(review): presumably moves to the first key greater than `key` -- confirm against ObjectMapIterator
 30   if (iter->valid()) {
 31     if (next)
 32       *next = make_pair(iter->key(), iter->value());
 33     return 0;
 34   } else {
 35     return -ENOENT;
 36   }
 37 }
 38 
 // Encodable (snap, hoid) pair; this is the value that to_raw() stores and
 // from_raw() reads back.
 39 struct Mapping {
 40   snapid_t snap;
 41   hobject_t hoid;
 42   Mapping(const pair<snapid_t, hobject_t> &in)
 43     : snap(in.first), hoid(in.second) {}
 44   Mapping() : snap(0) {}
 45   void encode(bufferlist &bl) const {  // writes snap, then hoid
 46     ENCODE_START(1, 1, bl);  // NOTE(review): presumably version 1 / compat version 1 -- confirm against the encoding macros
 47     ::encode(snap, bl);
 48     ::encode(hoid, bl);
 49     ENCODE_FINISH(bl);
 50   }
 51   void decode(bufferlist::iterator &bl) {  // reads the fields in the same order encode() wrote them
 52     DECODE_START(1, bl);
 53     ::decode(snap, bl);
 54     ::decode(hoid, bl);
 55     DECODE_FINISH(bl);
 56   }
 57 };
 58 WRITE_CLASS_ENCODER(Mapping)
 59 
 // Builds the key prefix for `snap`: MAPPING_PREFIX followed by the snap id in
 // upper-case hex, zero-padded to at least sizeof(snap)*2 digits, then '_'.
 60 string SnapMapper::get_prefix(snapid_t snap)
 61 {
 62   char buf[100];
 63   int len = snprintf(
 64     buf, sizeof(buf),
 65     "%.*X_", (int)(sizeof(snap)*2),  // the precision argument sets the minimum number of hex digits
 66     static_cast<unsigned>(snap));  // NOTE(review): snap is narrowed to unsigned before formatting; if snapid_t is wider, the high bits are not printed -- confirm
 67   return MAPPING_PREFIX + string(buf, len);
 68 }
 69 
 // Returns the key for a (snap, object) pair:
 // get_prefix(snap) + shard_prefix + the object's string form.
 70 string SnapMapper::to_raw_key(            // the mapping uses two kinds of key; this one is the raw (snap -> object) key
 71   const pair<snapid_t, hobject_t> &in)
 72 {
 73   return get_prefix(in.first) + shard_prefix + in.second.to_str();
 74 }
 75 
 // Encodes `in` as a Mapping and pairs the resulting buffer with its raw key.
 76 pair<string, bufferlist> SnapMapper::to_raw(    // returns the raw key together with the encoded Mapping value
 77   const pair<snapid_t, hobject_t> &in)
 78 {
 79   bufferlist bl;
 80   ::encode(Mapping(in), bl);
 81   return make_pair(
 82     to_raw_key(in),
 83     bl);
 84 }
 85 
 // Decodes a Mapping from image.second and returns its (snap, hoid).
 // Only the value is read; the key string in image.first is not inspected.
 86 pair<snapid_t, hobject_t> SnapMapper::from_raw(    // inverse of to_raw: recovers the snap id and object from the stored value
 87   const pair<std::string, bufferlist> &image)
 88 {
 89   Mapping map;
 90   bufferlist bl(image.second);  // decode from a local copy of the value buffer
 91   bufferlist::iterator bp(bl.begin());
 92   ::decode(map, bp);
 93   return make_pair(map.snap, map.hoid);
 94 }
 95 
 // Returns true when to_test begins with MAPPING_PREFIX.
 96 bool SnapMapper::is_mapping(const string &to_test)
 97 {
 98   return to_test.substr(0, MAPPING_PREFIX.size()) == MAPPING_PREFIX;
 99 }
100 
// Returns the key under which an object's snap set is stored:
// OBJECT_PREFIX + shard_prefix + the object's string form.
101 string SnapMapper::to_object_key(const hobject_t &hoid)          // the mapping uses two kinds of key; this one is the object key
102 {
103   return OBJECT_PREFIX + shard_prefix + hoid.to_str();
104 }
105 
// Serializes this object_snaps into bl: oid first, then snaps.
106 void SnapMapper::object_snaps::encode(bufferlist &bl) const
107 {
108   ENCODE_START(1, 1, bl);
109   ::encode(oid, bl);
110   ::encode(snaps, bl);
111   ENCODE_FINISH(bl);
112 }
113 
// Reads oid and then snaps from bl, mirroring the order used by encode().
114 void SnapMapper::object_snaps::decode(bufferlist::iterator &bl)
115 {
116   DECODE_START(1, bl);
117   ::decode(oid, bl);
118   ::decode(snaps, bl);
119   DECODE_FINISH(bl);
120 }
121 
// Fetches the object_snaps record stored under oid's object key.
// `out` may be NULL, in which case the record is looked up but not decoded.
// Returns a negative backend error, -ENOENT when no record exists, else 0.
122 int SnapMapper::get_snaps(      // looks up the snaps recorded for oid in the backend and decodes them into *out
123   const hobject_t &oid,
124   object_snaps *out)
125 {
126   assert(check(oid));
127   set<string> keys;
128   map<string, bufferlist> got;
129   keys.insert(to_object_key(oid));
130   int r = backend.get_keys(keys, &got);
131   if (r < 0)
132     return r;
133   if (got.empty())
134     return -ENOENT;
135   if (out) {
136     bufferlist::iterator bp = got.begin()->second.begin();  // only one key was requested, so decode the first entry
137     ::decode(*out, bp);
138     dout(20) << __func__ << " " << oid << " " << out->snaps << dendl;
139     assert(!out->snaps.empty());  // a stored record is expected to list at least one snap
140   } else {
141     dout(20) << __func__ << " " << oid << " (out == NULL)" << dendl;
142   }
143   return 0;
144 }
145 
// Asks the backend to remove oid's object key, passing t along to remove_keys().
146 void SnapMapper::clear_snaps(         // removes the key produced by to_object_key(oid)
147   const hobject_t &oid,
148   MapCacher::Transaction<std::string, bufferlist> *t)
149 {
150   assert(check(oid));
151   set<string> to_remove;
152   to_remove.insert(to_object_key(oid));
153   backend.remove_keys(to_remove, t);
154 }
155 
// Encodes `in` and stores it under oid's object key via backend.set_keys().
156 void SnapMapper::set_snaps(           // builds a one-entry map {object key -> encoded snaps} and hands it to set_keys together with transaction t
157   const hobject_t &oid,
158   const object_snaps &in,
159   MapCacher::Transaction<std::string, bufferlist> *t)
160 {
161   assert(check(oid));
162   map<string, bufferlist> to_set;
163   bufferlist bl;
164   ::encode(in, bl);
165   to_set[to_object_key(oid)] = bl;
166   backend.set_keys(to_set, t);
167 }
168 
// Replaces the snap set recorded for oid with new_snaps.
// An empty new_snaps removes the object entirely via remove_oid().
// When old_snaps_check is non-NULL, the currently stored set is asserted to
// equal it. Returns the error from get_snaps() if the lookup fails, else 0.
169 int SnapMapper::update_snaps(          // writes new_snaps into the backend as oid's updated snap set
170   const hobject_t &oid,
171   const set<snapid_t> &new_snaps,
172   const set<snapid_t> *old_snaps_check,
173   MapCacher::Transaction<std::string, bufferlist> *t)
174 {
175   dout(20) << __func__ << " " << oid << " " << new_snaps
176        << " was " << (old_snaps_check ? *old_snaps_check : set<snapid_t>())
177        << dendl;
178   assert(check(oid));
179   if (new_snaps.empty())
180     return remove_oid(oid, t);
181 
182   object_snaps out;
183   int r = get_snaps(oid, &out);  // `out` holds the snap set stored before this update
184   if (r < 0)
185     return r;
186   if (old_snaps_check)
187     assert(out.snaps == *old_snaps_check);
188 
189   object_snaps in(oid, new_snaps);
190   set_snaps(oid, in, t);
191 
192   set<string> to_remove;
193   for (set<snapid_t>::iterator i = out.snaps.begin();
194        i != out.snaps.end();
195        ++i) {
196     if (!new_snaps.count(*i)) {
197       to_remove.insert(to_raw_key(make_pair(*i, oid)));  // drop the raw key of every snap that is no longer listed
198     }
199   }
200   backend.remove_keys(to_remove, t);  // NOTE(review): raw keys are only removed here, never added -- presumably new_snaps is expected to be a subset of the old set; confirm with callers
201   return 0;
202 }
203 
// Registers a new object: stores its object_snaps record under the object key
// and one raw (snap, oid) entry per snap. Asserts that oid has no record yet.
204 void SnapMapper::add_oid(
205   const hobject_t &oid,
206   const set<snapid_t>& snaps,
207   MapCacher::Transaction<std::string, bufferlist> *t)
208 {
209   dout(20) << __func__ << " " << oid << " " << snaps << dendl;
210   assert(check(oid));
211   {
212     object_snaps out;
213     int r = get_snaps(oid, &out);
214     assert(r == -ENOENT);  // the object must not already be present
215   }
216 
217   object_snaps _snaps(oid, snaps);
218   set_snaps(oid, _snaps, t);
219 
220   map<string, bufferlist> to_add;
221   for (set<snapid_t>::iterator i = snaps.begin();
222        i != snaps.end();
223        ++i) {
224     to_add.insert(to_raw(make_pair(*i, oid)));
225   }
226   backend.set_keys(to_add, t);  // hands the collected raw entries and transaction t to the backend
227 }
228 
// Finds one object that still has a raw mapping entry for `snap`.
// Walks `prefixes` in order and asks the backend for the key that follows
// get_prefix(snap) + prefix. On a match the decoded object is stored in *hoid
// (hoid may be NULL) and 0 is returned; returns -ENOENT when nothing is found.
229 int SnapMapper::get_next_object_to_trim(
230   snapid_t snap,
231   hobject_t *hoid)
232 {
233   for (set<string>::iterator i = prefixes.begin();
234        i != prefixes.end();
235        ++i) {
236     string list_after(get_prefix(snap) + *i);
237 
238     pair<string, bufferlist> next;
239     int r = backend.get_next(list_after, &next);
240     if (r < 0) {
241       break; // Done
242     }
243 
244     if (next.first.substr(0, list_after.size()) !=
245     list_after) {
246       continue; // Done with this prefix
247     }
248 
249     assert(is_mapping(next.first));
250 
251     pair<snapid_t, hobject_t> next_decoded(from_raw(next));
252     assert(next_decoded.first == snap);  // the decoded value must agree with the key prefix that was searched
253     assert(check(next_decoded.second));
254 
255     if (hoid)
256       *hoid = next_decoded.second;
257     return 0;
258   }
259   return -ENOENT;
260 }
261 
262 
// Logs the request, asserts check(oid), then delegates to _remove_oid()
// and returns its result.
263 int SnapMapper::remove_oid(
264   const hobject_t &oid,
265   MapCacher::Transaction<std::string, bufferlist> *t)
266 {
267   dout(20) << __func__ << " " << oid << dendl;
268   assert(check(oid));
269   return _remove_oid(oid, t);
270 }
271 
// Removes everything recorded for oid: its object key and the raw key of each
// snap in its stored snap set. Returns the error from get_snaps() if the
// record cannot be read, else 0.
272 int SnapMapper::_remove_oid(
273   const hobject_t &oid,
274   MapCacher::Transaction<std::string, bufferlist> *t)
275 {
276   object_snaps out;
277   int r = get_snaps(oid, &out);
278   if (r < 0)
279     return r;
280 
281   clear_snaps(oid, t);       // remove the object key (to_object_key)
282 
283   set<string> to_remove;
284   for (set<snapid_t>::iterator i = out.snaps.begin();
285        i != out.snaps.end();
286        ++i) {
287     to_remove.insert(to_raw_key(make_pair(*i, oid)));
288   }
289   backend.remove_keys(to_remove, t);    // remove the raw keys (to_raw_key)
290   return 0;
291 }
292 
// Overload that returns only the snap ids: reads oid's object_snaps record and
// swaps its snap set into *snaps. `snaps` may be NULL.
// Returns the error from the object_snaps overload on failure, else 0.
293 int SnapMapper::get_snaps(         // looks up the snaps for oid and stores them in the caller's set
294   const hobject_t &oid,
295   std::set<snapid_t> *snaps)
296 {
297   assert(check(oid));
298   object_snaps out;
299   int r = get_snaps(oid, &out);
300   if (r < 0)
301     return r;
302   if (snaps)
303     snaps->swap(out.snaps);  // swap avoids copying the set out of the temporary record
304   return 0;
305 }