"use strict";
var P = require('bluebird');
var Template = require('swagger-router').Template;
var makeFileStore = require('./filestore');
var makeSQLiteStore = require('./sqlitestore');
// Enable heap dumps in /tmp on kill -USR2.
// See https://github.com/bnoordhuis/node-heapdump/
// For node 0.6/0.8: npm install heapdump@0.1.0
// For 0.10: npm install heapdump
process.on('SIGUSR2', function() {
    // Nothing in here may throw: an exception escaping a signal handler
    // would take the whole dump down just because a snapshot was requested.
    try {
        // Loaded lazily, so the module is only needed when a snapshot is
        // actually requested.
        var heapdump = require('heapdump');
        console.log( "warning", "SIGUSR2 received! Writing snapshot." );
        // writeSnapshot() is called without a file name, so the snapshot
        // lands in the current working directory. Switch to /tmp only for
        // the duration of the call and switch back afterwards, so that
        // relative paths used by the rest of the process keep resolving
        // against the original directory.
        // NOTE(review): assumes writeSnapshot() picks up the working
        // directory before it returns - confirm for the installed version.
        var previousDir = process.cwd();
        process.chdir('/tmp');
        try {
            heapdump.writeSnapshot();
        } finally {
            process.chdir(previousDir);
        }
    } catch (e) {
        console.error('Unable to write heap snapshot:', e);
    }
});
var preq = require('preq');
var PromiseStream = require('./PromiseStream');
// The request template used for fetching each individual article.
// It is assigned once in makeDump() and expanded per article in
// dumpArticle(); it stays undefined until makeDump() has run.
var articleReqTpl;
/**
 * Fetch one batch of [title, revision id] pairs from the MediaWiki action
 * API, using the `allpages` generator together with `prop=revisions`.
 *
 * @param {Object} options - reads `apiURL`, `userAgent`, `prefix` (sent as
 *   the Host header), `ns` and `startTitle`.
 * @param {Object|null} res - the previous result of this function, or
 *   `{next: ''}` for the first batch. `res.next` holds the continuation
 *   parameters returned by the API, or the string 'finished'.
 * @return {Promise} resolves to null when there is nothing left to fetch,
 *   otherwise to `{articles, next, encoding}` where `articles` is an array
 *   of `[title, revid]` pairs (spaces in titles replaced by underscores)
 *   and `next` is the continuation for the following call, or 'finished'.
 *   Rejects (after logging) when the request fails or the API response is
 *   unusable.
 */
function getArticles (options, res) {
    if (!res || res.next === 'finished') {
        // nothing more to do.
        return P.resolve(null);
    }

    const next = res.next || '';
    const query = {
        action: 'query',
        generator: 'allpages',
        gaplimit: '500',
        gapfrom: options.startTitle,
        prop: 'revisions',
        gapnamespace: options.ns,
        format: 'json',
        'continue': '',
    };
    // merge in the paging parameters
    if (next) {
        Object.assign(query, next);
    }

    return preq.get({
        uri: options.apiURL,
        query: query,
        headers: {
            'user-agent': options.userAgent,
            host: options.prefix
        },
        timeout: 60 * 1000,
        retries: 5
    })
    .then(function(response) {
        const body = response.body;
        // Anything but a decoded JSON object cannot be interpreted. Fail
        // loudly, otherwise it would look like an empty final batch.
        if (!body || typeof body !== 'object' || Buffer.isBuffer(body)) {
            throw new Error('Unexpected response body from ' + options.apiURL);
        }
        if (body.error) {
            throw new Error(`API error ${body.error.code}: ${body.error.info}`);
        }

        // The API leaves out `query` altogether when the generator matched
        // no pages (empty namespace, start title past the last page).
        // That is an empty batch, not a failure.
        const pages = (body.query && body.query.pages) || {};
        const articles = [];
        Object.keys(pages).forEach(function(key) {
            const page = pages[key];
            if (page.revisions !== undefined) {
                const title = page.title.replace(/ /g, '_');
                articles.push([title, page.revisions[0].revid]);
            }
        });

        return {
            articles: articles,
            next: body['continue'] || 'finished',
            // NOTE(review): nothing in this file reads `encoding`; it is
            // kept only so the shape of the result does not change.
            encoding: null
        };
    })
    .catch(function(e) {
        console.error('Error in getArticles:', e);
        throw e;
    });
}
/**
 * Fetch one article revision and pass it to the configured store, unless
 * the store reports that it already has this revision.
 *
 * Without a store the revision is still fetched, but the response is
 * discarded.
 *
 * @param {Object} options - reads `store`, `verbose` and `prefix`; the
 *   whole object is also merged into the parameters used to expand the
 *   article request template.
 * @param {string} title - article title.
 * @param {number} oldid - revision id.
 * @return {Promise} resolves to the result of `store.saveArticle()` when
 *   the article was fetched and stored, otherwise to undefined.
 */
function dumpArticle (options, title, oldid) {
    const revisionCheck = options.store
        ? options.store.checkArticle(title, oldid)
        : P.resolve(false);

    return revisionCheck.then((alreadyStored) => {
        if (alreadyStored) {
            if (options.verbose) {
                console.log('Exists:', title, oldid);
            }
            return undefined;
        }

        if (options.verbose) {
            console.log(`Dumping ${options.prefix} / ${title} / ${oldid}`);
        }
        // Keys present in options take precedence over title / oldid here.
        const params = Object.assign({title: title, oldid: oldid}, options);
        const request = articleReqTpl.expand({request: {params: params}});
        return preq.get(request).then((res) => {
            if (options.store) {
                return options.store.saveArticle(res.body, title, oldid);
            }
            return undefined;
        });
    });
}
/**
 * Processes chunks of articles one by one.
 *
 * Articles are handed out individually through getArticle() / next(). A new
 * chunk is requested from the chunk stream only when the local buffer is
 * empty and a consumer is waiting for an article.
 */
class Dumper {
    /**
     * @param {Object} articleChunkStream - object whose `next()` returns a
     *   promise for `{articles: [[title, oldid], ...]}`, or for null once
     *   there are no more chunks.
     * @param {Object} options - passed through to dumpArticle().
     */
    constructor (articleChunkStream, options) {
        this.articleChunkStream = articleChunkStream;
        this.options = options;
        // Articles of the current chunk that were not handed out yet.
        this.articles = [];
        // {resolve, reject} pairs of getArticle() calls awaiting a chunk.
        this.waiters = [];
        // Set once the chunk stream has signalled its end with null.
        this.done = false;
    }

    /**
     * Ask the chunk stream for the next chunk and feed it to
     * processArticles(). A failure (in the stream or while processing the
     * chunk) is passed on to everyone currently waiting for an article.
     */
    requestChunk () {
        this.articleChunkStream.next()
        .then((chunk) => this.processArticles(chunk))
        .catch((err) => this.rejectWaiters(err));
    }

    /**
     * Reject every pending getArticle() call, so that callers see the
     * failure instead of waiting forever.
     *
     * @param {*} err - the rejection reason to pass on.
     */
    rejectWaiters (err) {
        const pending = this.waiters;
        this.waiters = [];
        pending.forEach((waiter) => waiter.reject(err));
    }

    /**
     * Take in a freshly fetched chunk and serve waiting consumers from it,
     * oldest waiter first.
     *
     * @param {Object|null} newArticles - `{articles: [...]}`, or null when
     *   the chunk stream is exhausted.
     */
    processArticles (newArticles) {
        if (newArticles === null) {
            this.done = true;
            const pending = this.waiters;
            this.waiters = [];
            pending.forEach((waiter) => waiter.resolve(null));
            return;
        }
        this.articles = newArticles.articles;
        while (this.waiters.length && this.articles.length) {
            this.waiters.shift().resolve(this.articles.shift());
        }
        // The chunk did not satisfy everybody: get the next one.
        if (this.waiters.length) {
            this.requestChunk();
        }
    }

    /**
     * Get the next article.
     *
     * @return {Promise} resolves to a `[title, oldid]` pair, or to null
     *   once all articles were handed out; rejects when fetching the next
     *   chunk fails.
     */
    getArticle () {
        if (this.articles.length) {
            return P.resolve(this.articles.shift());
        }
        if (this.done) {
            // The stream already ended; do not ask it again.
            return P.resolve(null);
        }
        // A non-empty waiter list means a chunk request is already pending.
        if (!this.waiters.length) {
            this.requestChunk();
        }
        return new P((resolve, reject) => {
            this.waiters.push({resolve: resolve, reject: reject});
        });
    }

    /**
     * Dump the next article.
     *
     * Errors from dumping an individual article are logged and swallowed,
     * so that one bad article does not abort the whole run.
     *
     * @return {Promise} resolves to null when there are no more articles,
     *   to the result of dumpArticle() on success, or to undefined when
     *   dumping the article failed.
     */
    next () {
        return this.getArticle()
        .then((article) => {
            if (article === null) {
                return null;
            }
            const title = article[0];
            const oldid = article[1];
            return dumpArticle(this.options, title, oldid)
            .catch((e) => {
                console.error('Error in htmldumper:', title, oldid, e);
            });
        });
    }
}
/**
 * Drive the dump: pull results from the dump stream until it yields null.
 *
 * @param {Object} options - handed to getArticles() and the Dumper;
 *   `options.concurrency` is passed on to the dump PromiseStream.
 * @return {Promise} resolves (with undefined) once the dump stream yields
 *   null; rejects with the first error surfacing from the stream.
 */
function dumpLoop (options) {
    const chunkStream = new PromiseStream(getArticles.bind(null, options),
            {next: ''}, 6);
    const dumper = new Dumper(chunkStream, options);
    const dumpStream = new PromiseStream(dumper.next.bind(dumper),
            undefined, 1, options.concurrency);

    // Number of results handled since the promise chain was last cut.
    let handled = 0;

    return new P((resolve, reject) => {
        const step = () => dumpStream.next()
            .then((res) => {
                if (res === null) {
                    return resolve();
                }
                if (handled++ !== 10000) {
                    // Chain onto the next step.
                    return step();
                }
                // Every so often, start a fresh chain from the next tick
                // rather than nesting one more promise.
                handled = 0;
                process.nextTick(step);
                return undefined;
            })
            .catch((e) => {
                // NOTE(review): `instanceof String` only matches String
                // wrapper objects, never primitive strings - confirm
                // whether `typeof e === 'string'` was intended.
                if (e instanceof String) {
                    resolve(e);
                } else {
                    reject(e);
                }
            });

        step();
    });
}
/**
 * Run a complete dump.
 *
 * Sets up the store selected by the options (file store for
 * `options.saveDir`, SQLite store for `options.dataBase`, none otherwise),
 * builds the article request template, runs the dump loop and finally
 * closes the store - whether the dump succeeded or not.
 *
 * @param {Object} options - reads `saveDir`, `dataBase`, `url`,
 *   `userAgent` and `noCache`; `options.store` is set to the store in use.
 * @return {Promise} resolves to the result of `store.close()` (undefined
 *   when there is no store or it has no close method); rejects with the
 *   error that made the dump fail.
 */
function makeDump (options) {
    let storeSetup = P.resolve();
    if (options.saveDir) {
        storeSetup = makeFileStore(options);
    } else if (options.dataBase) {
        storeSetup = makeSQLiteStore(options);
    }

    // set up the article request template once on start-up
    articleReqTpl = new Template({
        method: 'get',
        uri: options.url,
        headers: {
            'user-agent': options.userAgent,
            'accept-encoding': 'gzip',
            'cache-control': options.noCache ? 'no-cache' : undefined
        },
        retries: 5,
        timeout: 60000,
        // Request a Buffer by default, don't decode to a String. This
        // saves CPU cycles, but also a lot of memory as large strings are
        // stored in the old space of the JS heap while Buffers are stored
        // outside the JS heap.
        encoding: null
    });

    function closeStore () {
        if (options.store && options.store.close) {
            return options.store.close();
        }
        return undefined;
    }

    return storeSetup
    .then(function(store) {
        options.store = store;
        // Starting the loop from a then() callback turns a synchronous
        // throw into a rejection, so the store is closed in that case too.
        return P.resolve(options)
        .then(dumpLoop)
        .then(closeStore, function(dumpError) {
            // The dump failed: still release the store, then report the
            // original failure. A failure to close is only logged so that
            // it cannot mask the error that caused the dump to abort.
            return P.resolve()
            .then(closeStore)
            .catch(function(closeError) {
                console.error('Error closing store after failed dump:', closeError);
            })
            .then(function() {
                throw dumpError;
            });
        });
    });
}
module.exports = makeDump;