写在前面:消息订阅与推送都通过nsq的tcp服务实现。
关于消息的推送最重要的是两个文件:nsqd/protocol_v2.go和nsqd/client_v2.go。
当一个客户端与nsqd进程建立了一个tcp连接时,会调用protocolV2.IOLoop方法,并新建一个clientV2结构体对象。IOLoop方法会启动一个协程执行messagePump方法。因此对于每一个tcp连接,都会有两个协程:运行IOLoop的协程负责接收客户端的请求;运行messagePump的协程负责处理数据,并通过clientV2把消息推送给客户端。
下面具体来讲
首先需要客户端订阅消息,会调用“SUB”接口,最终会到nsqd/protocol_v2.go中的SUB函数,函数逻辑如下
// SUB handles the SUB command: it validates the request, attaches the client
// to the requested topic/channel and hands the channel to the client's
// messagePump through client.SubEventChan.
//
// params[1] is the topic name and params[2] is the channel name (params[0] is
// presumably the command name itself). On success it returns okBytes; every
// validation failure is reported as a fatal client error.
func (p *protocolV2) SUB(client *clientV2, params [][]byte) ([]byte, error) {
	// Preconditions, checked in order: the client must still be in its initial
	// state, must have heartbeats enabled, and must supply topic and channel.
	switch {
	case atomic.LoadInt32(&client.State) != stateInit:
		return nil, protocol.NewFatalClientErr(nil, "E_INVALID", "cannot SUB in current state")
	case client.HeartbeatInterval <= 0:
		return nil, protocol.NewFatalClientErr(nil, "E_INVALID", "cannot SUB with heartbeats disabled")
	case len(params) < 3:
		return nil, protocol.NewFatalClientErr(nil, "E_INVALID", "SUB insufficient number of parameters")
	}

	// Validate the topic name.
	topicName := string(params[1])
	if !protocol.IsValidTopicName(topicName) {
		msg := fmt.Sprintf("SUB topic name %q is not valid", topicName)
		return nil, protocol.NewFatalClientErr(nil, "E_BAD_TOPIC", msg)
	}

	// Validate the channel name.
	channelName := string(params[2])
	if !protocol.IsValidChannelName(channelName) {
		msg := fmt.Sprintf("SUB channel name %q is not valid", channelName)
		return nil, protocol.NewFatalClientErr(nil, "E_BAD_CHANNEL", msg)
	}

	// Check that the client is allowed to SUB to this topic/channel.
	if authErr := p.CheckAuth(client, "SUB", topicName, channelName); authErr != nil {
		return nil, authErr
	}

	// Retry until the client has been added to a topic/channel pair that is
	// not an ephemeral one in the middle of exiting; this avoids attaching the
	// client to a topic or channel that is about to disappear.
	var channel *Channel
	for {
		topic := p.ctx.nsqd.GetTopic(topicName)
		channel = topic.GetChannel(channelName)

		// Any failure to add the client is reported as the channel having
		// reached its consumer limit.
		if addErr := channel.AddClient(client.ID, client); addErr != nil {
			msg := fmt.Sprintf("channel consumers for %s:%s exceeds limit of %d",
				topicName, channelName, p.ctx.nsqd.getOpts().MaxChannelConsumers)
			return nil, protocol.NewFatalClientErr(nil, "E_TOO_MANY_CHANNEL_CONSUMERS", msg)
		}

		if !(channel.ephemeral && channel.Exiting()) && !(topic.ephemeral && topic.Exiting()) {
			break
		}

		// The channel or topic is going away: undo the registration, back off
		// briefly and look both up again.
		channel.RemoveClient(client.ID)
		time.Sleep(time.Millisecond)
	}

	// Mark the client as subscribed and remember its channel.
	atomic.StoreInt32(&client.State, stateSubscribed)
	client.Channel = channel

	// Notify messagePump so it starts delivering messages from this channel.
	client.SubEventChan <- channel

	return okBytes, nil
}
然后到messagePump中
// messagePump is the delivery loop for one client: on every iteration it
// decides which channels to select on, then either picks up a subscription
// event, sends one message to the client, or exits.
//
// This is an excerpt: the lines consisting of "..." mark code that has been
// omitted (variable declarations, ticker setup and further select cases).
func (p *protocolV2) messagePump(client *clientV2, startedChan chan bool) {
...
// Keep a local copy of the client's SubEventChan so that it can be set to nil
// later without touching the client itself.
subEventChan := client.SubEventChan
...
for {
if subChannel == nil || !client.IsReadyForMessages() {
// the client is not ready to receive messages...
// (receiving from a nil channel blocks forever, so setting these to nil
// disables the corresponding select cases below)
memoryMsgChan = nil
backendMsgChan = nil
flusherChan = nil
// force flush
client.writeLock.Lock()
err = client.Flush()
client.writeLock.Unlock()
if err != nil {
goto exit
}
flushed = true
} else if flushed {
// last iteration we flushed...
// do not select on the flusher ticker channel
memoryMsgChan = subChannel.memoryMsgChan
backendMsgChan = subChannel.backend.ReadChan()
flusherChan = nil
} else {
// we're buffered (if there isn't any more data we should flush)...
// select on the flusher ticker channel, too
// The two message channels below are where messages to push come from.
memoryMsgChan = subChannel.memoryMsgChan
backendMsgChan = subChannel.backend.ReadChan()
flusherChan = outputBufferTicker.C
}
select {
...
case subChannel = <-subEventChan:
// Setting subEventChan to nil disables this case for the rest of the loop,
// so a client cannot subscribe a second time.
subEventChan = nil
...
case b := <-backendMsgChan:
// Start pushing a message read from the backend.
// When sampling is enabled (sampleRate > 0), skip the message if a random
// draw from [0, 100) is greater than sampleRate.
if sampleRate > 0 && rand.Int31n(100) > sampleRate {
continue
}
// NOTE(review): ":=" declares a new err scoped to this case, shadowing the
// err that is checked after the exit label. A SendMessage failure below
// therefore still jumps to exit but does not appear to be logged there
// - confirm against the full function.
msg, err := decodeMessage(b)
if err != nil {
p.ctx.nsqd.logf(LOG_ERROR, "failed to decode message - %s", err)
continue
}
msg.Attempts++
// Stamp the message and record it as in flight (see StartInFlightTimeout).
// NOTE(review): the error returned by StartInFlightTimeout is discarded here.
subChannel.StartInFlightTimeout(msg, client.ID, msgTimeout)
// Mark that a message is being sent; presumably increments the client's
// in-flight and message counters - defined elsewhere, confirm.
client.SendingMessage()
// Send the message to the client.
err = p.SendMessage(client, msg)
if err != nil {
goto exit
}
flushed = false
case msg := <-memoryMsgChan:
// Same flow as the backend case, except that the message arrives already
// decoded, so there is no decodeMessage step.
if sampleRate > 0 && rand.Int31n(100) > sampleRate {
continue
}
msg.Attempts++
// NOTE(review): the error returned by StartInFlightTimeout is discarded here.
subChannel.StartInFlightTimeout(msg, client.ID, msgTimeout)
client.SendingMessage()
err = p.SendMessage(client, msg)
if err != nil {
goto exit
}
flushed = false
case <-client.ExitChan:
goto exit
}
}
exit:
// Common exit path: log, stop both tickers, and report the error (if any)
// that ended the loop.
p.ctx.nsqd.logf(LOG_INFO, "PROTOCOL(V2): [%s] exiting messagePump", client)
heartbeatTicker.Stop()
outputBufferTicker.Stop()
if err != nil {
p.ctx.nsqd.logf(LOG_ERROR, "PROTOCOL(V2): [%s] messagePump error - %s", client, err)
}
}
消息推送会调用StartInFlightTimeout,然后设置一些标志位,用于判断是否可以继续推送消息
下面看一下StartInFlightTimeout做了什么
// StartInFlightTimeout stamps msg with the receiving client and the delivery
// time, then registers it as in flight on the channel.
//
// msg.pri is set to the absolute deadline (delivery time + timeout) in Unix
// nanoseconds. It returns the error from pushInFlightMessage if the message
// could not be recorded, in which case the message is not added to the
// in-flight priority queue.
func (c *Channel) StartInFlightTimeout(msg *Message, clientID int64, timeout time.Duration) error {
	// Fill in the per-delivery bookkeeping fields on the message.
	deliveredAt := time.Now()
	msg.clientID = clientID
	msg.deliveryTS = deliveredAt
	msg.pri = deliveredAt.Add(timeout).UnixNano()

	// Record the message in the in-flight map first; stop if that fails.
	if err := c.pushInFlightMessage(msg); err != nil {
		return err
	}

	// Queue the message for timeout tracking. Presumably this is a min-heap
	// ordered by msg.pri holding messages sent but not yet acknowledged -
	// addToInFlightPQ is defined elsewhere.
	c.addToInFlightPQ(msg)
	return nil
}
// pushInFlightMessage records msg in the channel's in-flight map under
// inFlightMutex. It returns an error, leaving the map untouched, when a
// message with the same ID is already in flight.
func (c *Channel) pushInFlightMessage(msg *Message) error {
	c.inFlightMutex.Lock()
	defer c.inFlightMutex.Unlock()

	if _, exists := c.inFlightMessages[msg.ID]; exists {
		return errors.New("ID already in flight")
	}
	c.inFlightMessages[msg.ID] = msg
	return nil
}
消息推送到此结束。消费端消费完消息后需要调用“FIN”接口,表示此消息已经消费完,逻辑如下
// FIN handles the FIN command, by which a client reports that it has finished
// processing a message.
//
// params[1] carries the message ID (params[0] is presumably the command name
// itself). It returns (nil, nil) on success. An invalid client state, missing
// parameters or an unparsable ID yield a fatal client error; a failure to
// finish the message yields a non-fatal E_FIN_FAILED client error.
func (p *protocolV2) FIN(client *clientV2, params [][]byte) ([]byte, error) {
	// FIN is only accepted while the client is subscribed or closing.
	if st := atomic.LoadInt32(&client.State); !(st == stateSubscribed || st == stateClosing) {
		return nil, protocol.NewFatalClientErr(nil, "E_INVALID", "cannot FIN in current state")
	}

	// At least two parameters are required: the command and the message ID.
	if len(params) < 2 {
		return nil, protocol.NewFatalClientErr(nil, "E_INVALID", "FIN insufficient number of params")
	}

	// Parse the message ID.
	msgID, parseErr := getMessageID(params[1])
	if parseErr != nil {
		return nil, protocol.NewFatalClientErr(nil, "E_INVALID", parseErr.Error())
	}

	// Finish the message on the client's channel.
	if finErr := client.Channel.FinishMessage(client.ID, *msgID); finErr != nil {
		detail := fmt.Sprintf("FIN %s failed %s", *msgID, finErr.Error())
		return nil, protocol.NewClientErr(finErr, "E_FIN_FAILED", detail)
	}

	// Update the client's own bookkeeping for the finished message
	// (FinishedMessage is defined elsewhere).
	client.FinishedMessage()

	return nil, nil
}
顺便提一下“RDY”接口:它用于设置客户端准备接收的消息条数上限(readyCount)。推送前会将其与InFlightCount(发送中的消息条数)对比,若inFlightCount >= readyCount则不推送消息