在用hive做nginx日志url 分析的时候,经常需要parse url。
hive中自带的函数parse_url可以实现这个功能,不过它对格式的要求比较严格,不能直接用于nginx log的request字段。
hive -e "select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST') from dual" facebook.com
hive -e "select parse_url('facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST') from dual" NULL
也可以通过regexp_extract来实现,不过需要手写正则表达式,同时性能也有些问题。
hive -e "select regexp_extract('GET /vips-mobile/router.do?api_key=24415b921531551cb2ba756b885ce783&app_version=1.8.6&fields=sku_id HTTP/1.1','.+? +(.+?)app_version=(.+?)&(.+) .+?',2) from dual" 1.8.6
考虑自己写一个,借鉴parse_url的udf:
代码如下:
package com.hive.myudf;

import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * Hive UDF that parses the {@code request} field of an nginx access log
 * (e.g. {@code "GET /path/p.php?k=v HTTP/1.1"}) together with a host name,
 * and extracts one URL component, mirroring Hive's built-in
 * {@code parse_url} semantics.
 *
 * <p>Usage:
 * <pre>
 *   nginx_url_parse(host, request, 'HOST' | 'PATH' | 'QUERY' | 'REF' |
 *                   'PROTOCOL' | 'FILE' | 'AUTHORITY' | 'USERINFO')
 *   nginx_url_parse(host, request, 'QUERY', key)   -- single query parameter
 * </pre>
 *
 * <p>All parsing state is method-local, so the UDF is stateless and safe to
 * reuse across rows (the original kept a mutable {@code URL} field that was
 * dereferenced even when the request line failed to match, yielding an NPE
 * or a stale value from a previous row).
 */
public class UDFNginxParseUrl extends UDF {

  /** Scheme prepended to host + path so {@link java.net.URL} can parse it. */
  private static final String SCHEME = "http://";

  /**
   * Splits an nginx request line into method, path-with-query, and protocol.
   * Compiled once instead of on every row.
   */
  private static final Pattern REQUEST_LINE = Pattern.compile("(.+?) +(.+?) (.+)");

  public UDFNginxParseUrl() {
  }

  /**
   * Extracts one component of the URL reconstructed from {@code host} and the
   * nginx request line {@code urlStr}.
   *
   * @param host          host name to prepend (nginx logs omit it)
   * @param urlStr        raw request field, e.g. {@code "GET /p?a=1 HTTP/1.1"}
   * @param partToExtract one of HOST, PATH, QUERY, REF, PROTOCOL, FILE,
   *                      AUTHORITY, USERINFO
   * @return the requested component, or {@code null} if any argument is null,
   *         the request line is malformed, or the part name is unknown
   */
  public String evaluate(String host, String urlStr, String partToExtract) {
    if (host == null || urlStr == null || partToExtract == null) {
      return null;
    }
    Matcher m = REQUEST_LINE.matcher(urlStr);
    if (!m.matches()) {
      // Malformed request line: the original fell through here and
      // dereferenced a null/stale URL field; null is the correct result.
      return null;
    }
    URL url;
    try {
      // group(2) is the path (plus query/fragment) from the request line.
      url = new URL(SCHEME + host + m.group(2));
    } catch (Exception e) {
      // Unparseable URL -> null, consistent with built-in parse_url.
      return null;
    }
    if (partToExtract.equals("HOST")) {
      return url.getHost();
    }
    if (partToExtract.equals("PATH")) {
      return url.getPath();
    }
    if (partToExtract.equals("QUERY")) {
      return url.getQuery();
    }
    if (partToExtract.equals("REF")) {
      return url.getRef();
    }
    if (partToExtract.equals("PROTOCOL")) {
      return url.getProtocol();
    }
    if (partToExtract.equals("FILE")) {
      return url.getFile();
    }
    if (partToExtract.equals("AUTHORITY")) {
      return url.getAuthority();
    }
    if (partToExtract.equals("USERINFO")) {
      return url.getUserInfo();
    }
    return null;
  }

  /**
   * Extracts the value of a single query-string parameter.
   *
   * @param host          host name to prepend
   * @param urlStr        raw nginx request field
   * @param partToExtract must be {@code "QUERY"}; anything else yields null
   * @param key           query parameter name to look up
   * @return the parameter's value, or {@code null} if absent or on any
   *         null/malformed input
   */
  public String evaluate(String host, String urlStr, String partToExtract, String key) {
    // Null-check partToExtract and key before use; the original NPE'd on
    // key.equals(lastKey) when key was null.
    if (partToExtract == null || !partToExtract.equals("QUERY") || key == null) {
      return null;
    }
    String query = this.evaluate(host, urlStr, partToExtract);
    if (query == null) {
      return null;
    }
    // Pattern.quote prevents regex metacharacters in the key from being
    // interpreted; the original interpolated the raw key into the pattern
    // and cached it in mutable instance state.
    Matcher m = Pattern.compile("(&|^)" + Pattern.quote(key) + "=([^&]*)").matcher(query);
    if (m.find()) {
      return m.group(2);
    }
    return null;
  }
}
add jar和create function之后测试:
hive -e "select nginx_url_parse('test.test.com','GET /vips-mobile/router.do?api_key=24415&app_version=1.8.6&fields=sku_id HTTP/1.1','HOST') FROM dual;" test.test.com
hive -e "select nginx_url_parse('test.test.com','GET /vips-mobile/router.do?api_key=24415&app_version=1.8.6&fields=sku_id HTTP/1.1','QUERY','api_key') FROM dual;" 24415
这样就可以直接应用于nginx的日志了。