lua字符串内部存储分为短字符串和长字符串,可以从下面的宏看出:
/* Basic type tag for strings */
#define LUA_TSTRING 4
/* Variant tags for strings: the variant number is shifted to bit 4 and OR-ed with the basic tag */
#define LUA_TSHRSTR (LUA_TSTRING | (0 << 4)) /* short strings: 4 | 0 = 4 */
#define LUA_TLNGSTR (LUA_TSTRING | (1 << 4)) /* long strings: 4 | 16 = 20 */
可以看出短字符串的类型标记为 4(4 | 0),长字符串的类型标记为 20(4 | 16);
TString 类型定义在 lobject.h 里:
/*
** Header for string value; string bytes follow the end of this structure
*/
typedef union TString {
L_Umaxalign dummy; /* ensures maximum alignment for strings */
struct {
CommonHeader;
lu_byte extra; /* short strings: marks reserved words; long strings: used for lazy hashing (per the original note; neither use is visible in this excerpt) */
unsigned int hash;
size_t len; /* number of bytes in the string, excluding the '\0' that createstrobj appends after them; kept explicitly, presumably because the bytes may themselves contain '\0' */
} tsv;
} TString;
前面 dummy 成员的类型 L_Umaxalign 来自下面这个宏:
/* Union of a double, a void * and a long; presumably the type behind L_Umaxalign, so that an object containing it is aligned at least as strictly as any of these members. */
#define LUAI_USER_ALIGNMENT_T union { double u; void *s; long l; }
在常见的平台上,这个 union 的大小是 8 个字节(double 占 8 字节),并且按其中对齐要求最严格的成员对齐;
TString 也是个 union,并且包含这个 dummy 成员,因此 TString 的大小一定是该对齐值(通常为 8)的倍数,紧跟在头部之后的字符串内容也就从对齐的地址开始。
字符串作为经常被用到的数据类型,会在很多场景下被创建:
比如:
/*
** Pushes onto the stack a Lua string built from the 'len' bytes starting
** at 's'. Returns getstr(ts), i.e. a pointer to the bytes held by the
** resulting string object rather than to the caller's buffer.
*/
LUA_API const char *lua_pushlstring (lua_State *L, const char *s, size_t len) {
TString *ts;
lua_lock(L);
luaC_checkGC(L); /* presumably lets the collector run before the allocation below */
ts = luaS_newlstr(L, s, len); /* short strings are interned, long ones are created anew */
setsvalue2s(L, L->top, ts); /* store the string in the slot at L->top */
api_incr_top(L); /* presumably advances L->top past the new value */
lua_unlock(L);
return getstr(ts);
}
这里就是用luaS_newlstr来创建字符串的,传入了参数lua_State,const char*,len,来看实现:
/*
** Builds a string object from the 'l' bytes at 'str'.
** Strings of at most LUAI_MAXSHORTLEN bytes go through internshrstr.
** Longer ones are created directly with tag LUA_TLNGSTR, the global seed
** stored in the hash field and no explicit list (NULL). A length whose
** header + bytes + terminator would not fit in MAX_SIZET is handed to
** luaM_toobig (presumably raises an error and does not return).
*/
TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
  if (l > LUAI_MAXSHORTLEN) {  /* long string? */
    if (l + 1 > (MAX_SIZET - sizeof(TString))/sizeof(char))
      luaM_toobig(L);
    return createstrobj(L, str, l, LUA_TLNGSTR, G(L)->seed, NULL);
  }
  return internshrstr(L, str, l);  /* short string: reuse or create */
}
首先判断:如果长度不超过 LUAI_MAXSHORTLEN,就是短字符串,进行内部化(intern);
否则是长字符串:如果长度太大(加上头部之后会超出 size_t 能表示的范围),就报错;否则直接创建一个新对象,不做内部化。
那首先看看如何内部化:
/*
** Looks up a short string in the string table. Walks the bucket selected
** by the string's hash and returns the first entry whose hash, length and
** bytes all match; a matching entry that is dead but not yet collected is
** resurrected first. If nothing matches, a new short string is created
** through newshrstr.
*/
static TString *internshrstr (lua_State *L, const char *str, size_t l) {
  global_State *g = G(L);
  unsigned int hash = luaS_hash(str, l, g->seed);
  GCObject *node = g->strt.hash[lmod(hash, g->strt.size)];
  while (node != NULL) {
    TString *cand = rawgco2ts(node);
    if (cand->tsv.hash == hash &&
        cand->tsv.len == l &&
        memcmp(str, getstr(cand), l * sizeof(char)) == 0) {
      if (isdead(g, node))  /* dead but still in the table? */
        changewhite(node);  /* bring it back */
      return cand;
    }
    node = gch(node)->next;
  }
  return newshrstr(L, str, l, hash);  /* no match in this bucket */
}
首先根据字符串内容、长度和 hash 种子计算出 hash 值,然后遍历 string table 中对应 hash 桶里的字符串对象,依次比较 hash 值、长度和内容(memcmp);如果都相等,说明该字符串已经存在:若它已被标记为死亡但尚未回收,就先把它复活(这是 GC 的内容,暂时跳过),然后直接返回这个已有对象,从而实现了重用;
当然如果没找到,就要第一次分配对象:
/*
** Creates a short string with precomputed hash 'h' and links it into the
** string-table bucket selected by that hash. When the table already holds
** at least as many strings as it has buckets (and doubling cannot exceed
** MAX_INT), the table is first resized to twice its size; the bucket is
** chosen only after that resize.
*/
static TString *newshrstr (lua_State *L, const char *str, size_t l,
                           unsigned int h) {
  stringtable *strt = &G(L)->strt;
  TString *ts;
  if (strt->nuse >= cast(lu_int32, strt->size) && strt->size <= MAX_INT/2)
    luaS_resize(L, strt->size*2);  /* table is too crowded */
  ts = createstrobj(L, str, l, LUA_TSHRSTR, h,
                    &strt->hash[lmod(h, strt->size)]);
  strt->nuse++;
  return ts;
}
当 string table 中已有的字符串数量(nuse)达到或超过桶的数量(size),并且 size 不超过 MAX_INT/2 时,说明表已经太拥挤、hash 冲突会增多,因此这时候要先进行 resize:把桶的数量扩大一倍,并按新的大小重新映射已有字符串的 hash 值;
这时候才开始真正分配新string:
/*
** Allocates a string object with type tag 'tag' and links it into 'list'
** (via luaC_newobj). The allocation covers the TString header plus 'l'
** bytes plus one terminator byte; the 'l' bytes at 'str' are copied right
** after the header and followed by '\0'. The header records the length
** 'l', the hash 'h', and extra = 0.
*/
static TString *createstrobj (lua_State *L, const char *str, size_t l,
                              int tag, unsigned int h, GCObject **list) {
  size_t totalsize = sizeof(TString) + ((l + 1) * sizeof(char));
  TString *ts = &luaC_newobj(L, tag, totalsize, list, 0)->ts;
  char *bytes = (char *)(ts + 1);  /* payload starts right after the header */
  ts->tsv.len = l;
  ts->tsv.hash = h;
  ts->tsv.extra = 0;
  memcpy(bytes, str, l*sizeof(char));
  bytes[l] = '\0';  /* ending 0 */
  return ts;
}
从 totalsize 的计算方法可以看出 TString 只是一个头,真正的内容(外加一个结尾的 '\0')紧跟在 TString 之后;
然后再来一层 创建对象:
/*
** Allocates a collectable object of type 'tt' and size 'sz' and pushes it
** onto the front of '*list'; when 'list' is NULL the global 'allgc' list
** is used instead. The object starts 'offset' bytes into the allocated
** block (the original note says this is used only by states). The new
** object is marked with the current white and tagged with 'tt'.
*/
GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, GCObject **list,
                       int offset) {
  global_State *g = G(L);
  char *mem = cast(char *, luaM_newobject(L, novariant(tt), sz));
  GCObject *obj = obj2gco(mem + offset);
  GCObject **head = (list != NULL) ? list : &g->allgc;  /* default list */
  gch(obj)->tt = tt;
  gch(obj)->marked = luaC_white(g);
  gch(obj)->next = *head;  /* insert at the front */
  *head = obj;
  return obj;
}
luaM_newobject 会调用 luaM_realloc_ 真正分配内存,这里具体的内存分配策略就权当是 malloc 分配的(暂时如此认为)。随后新对象被插到 *list 所指链表的头部:对短字符串来说是 string table 中对应的 hash 桶,对长字符串(list 为 NULL)来说是全局的 allgc 链表。另外回到 newshrstr 还可以看到,string table 的 nuse 计数也会随之加一。
好吧,字符串的创建分析完了。