1
|
hidden var 'fp7pos' from select jmergeregexp(jgroup(c1))
from (
    /* Variable fp7pos: result of jmergeregexp over the jgroup of every c1
       value in fp7positives. Rows are fed longest c1 first, presumably so
       that longer patterns come first in the merged expression - TODO
       confirm against the jmergeregexp documentation. */
    select c1
    from fp7positives
    order by length(c1) desc
);
|
2
|
hidden var 'fp7negheavy' from select jmergeregexp(jgroup(c1))
from (
    /* Variable fp7negheavy: result of jmergeregexp over the jgroup of every
       c1 value in fp7strongfilterwords, fed longest c1 first. The scoring
       query subtracts matches of this variable from the FP7 confidence. */
    select c1
    from fp7strongfilterwords
    order by length(c1) desc
);
|
3
|
hidden var 'fp7neglight' from select jmergeregexp(jgroup(c1))
from (
    /* Variable fp7neglight: result of jmergeregexp over the jgroup of every
       c1 value in fp7weakfilterwords, fed longest c1 first. The scoring
       query subtracts matches of this variable from the FP7 confidence. */
    select c1
    from fp7weakfilterwords
    order by length(c1) desc
);
|
4
|
hidden var 'fp7pospos' from select jmergeregexp(jgroup(c1))
from (
    /* Variable fp7pospos: result of jmergeregexp over the jgroup of every
       c1 value in the fp7pospos table, fed longest c1 first. The scoring
       query adds position-weighted matches of this variable to the FP7
       confidence. */
    select c1
    from fp7pospos
    order by length(c1) desc
);
|
5
|
hidden var 'fp7middlepos' from select jmergeregexp(jgroup(c1))
from (
    /* Variable fp7middlepos: result of jmergeregexp over the jgroup of the
       c1 values found in fp7positives, fp7pospos and fp7middlepos combined.
       The plain union removes duplicate rows.
       NOTE(review): the other variables in this script order their rows by
       descending length of c1 and this one does not - confirm that the
       difference is intentional. */
    select * from fp7positives
    union
    select * from fp7pospos
    union
    select * from fp7middlepos
);
|
6
|
hidden var 'wtnegheavy' from select jmergeregexp(jgroup(c1))
from (
    /* Variable wtnegheavy: result of jmergeregexp over the jgroup of every
       c1 value in wtstrongfilterwords, fed longest c1 first. The scoring
       query subtracts matches of this variable from the WT confidence. */
    select c1
    from wtstrongfilterwords
    order by length(c1) desc
);
|
7
|
hidden var 'wtneglight' from select jmergeregexp(jgroup(c1))
from (
    /* Variable wtneglight: result of jmergeregexp over the jgroup of every
       c1 value in wtweakfilterwords, fed longest c1 first. The scoring
       query subtracts matches of this variable from the WT confidence. */
    select c1
    from wtweakfilterwords
    order by length(c1) desc
);
|
8
|
hidden var 'wtpospos' from select jmergeregexp(jgroup(c1))
from (
    /* Variable wtpospos: result of jmergeregexp over the jgroup of every
       c1 value in wtposposwords, fed longest c1 first. The scoring query
       adds matches of this variable to the WT confidence. */
    select c1
    from wtposposwords
    order by length(c1) desc
);
|
9
|
|
10
|
create temp table pubs as setschema 'c1,c2' select
    /* Temporary table pubs with columns c1 and c2, one row per record read
       from stdinput(). jsonpath is applied to the input column c1 with the
       paths $.id and $.text, so c1 presumably ends up holding the document
       id and c2 the document text - TODO confirm against the input format. */
    jsonpath(c1, '$.id', '$.text')
from stdinput();
|
11
|
|
12
|
select
    /* Final output of the script: one jdict per (docid, id) pair with the keys
       documentId, projectId and confidenceLevel. Two branches are combined
       with union all, so a pair found by both branches is emitted twice.

       Branch 1: fixed confidenceLevel of 0.8. A pair is emitted when the upper
       cased regexpr extraction from a textwindow2s window over the cleaned
       text of pubs.c2 equals grants.grantid.

       String literals in this statement use single quotes. SQLite resolves a
       double quoted token as an identifier first and only falls back to a
       string, so double quoted literals can silently change meaning. */
    jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8)
from (
    select docid, id
    from (
        select
            docid,
            upper(regexpr('(\w+.*\d+)', middle)) as match,
            id,
            grantid
        from (
            select
                c1 as docid,
                textwindow2s(regexpr('\n', utf8clean(c2), ' '), 0, 1, 0, '.+\/\w+\/\d{4}\W*\Z')
            from pubs
        ), grants
        where match = grantid
    )
    group by docid, id
)

union all

select
    /* Branch 2: scored matches. For every (docid, id) pair the highest
       positive confidence over all candidate windows is kept, capped at 1.49,
       divided by 1.5 and passed through sqroot. */
    jdict('documentId', docid, 'projectId', id, 'confidenceLevel', sqroot(min(1.49, confidence) / 1.5))
from (
    select docid, id, max(confidence) as confidence
    from (
        select
            docid,
            id,
            case
                when fundingClass = 'WT' then
                    /*wellcome trust confidence*/
                    /* terms added to the score */
                    (regexpcountwords(var('wtpospos'), j2s(prevpack, nextpack)) * regexpcountwords('(?:collaborative|joint call)', j2s(prevpack, nextpack))) * 0.33 +
                    regexprmatches('\d{5}ma(?:\b|_)', middle) +
                    regexprmatches('(?:\d{5}_)(?:z|c|b|a)(?:_\d{2}_)(?:z|c|b|a)', middle) * 2 +
                    regexpcountwords(var('wtpospos'), prev) * 0.5 +
                    regexprmatches(var('wtpospos'), middle) +
                    regexpcountwithpositions(var('wtpospos'), prevpack) * 0.39 +
                    0.21 * regexpcountwithpositions(var('wtpospos'), nextpack, 1) -
                    /* terms subtracted from the score */
                    (regexprmatches(var('wtnegheavy'), middle) + regexprmatches('(?:n01|r01|dms)', middle) + regexprmatches('(?:ns|mh|hl|hd|ai)(?:_|)\d{5}', middle)) * 10 -
                    4 * regexpcountwords('(?:a|g|c|t|u){4}', middle) -
                    regexprmatches(var('wtneglight'), middle) * 0.3 -
                    regexpcountwithpositions(var('wtnegheavy'), prevpacksmall, 0, 1, 0.5) * 0.39 -
                    regexpcountwithpositions(var('wtneglight'), prevpacksmall, 0, 1, 0.5) * 0.18 -
                    0.45 * regexpcountwithpositions(var('wtneglight'), nextpack) -
                    0.21 * regexpcountwithpositions(var('wtnegheavy'), nextpack, 1)
                else
                    /* fp7 confidence */
                    /* terms added to the score */
                    regexprmatches(var('fp7middlepos'), middle) +
                    regexprmatches('(?:\b|_|\d)'||normalizedacro||'(?:\b|_|\d)', j2s(middle, prevpacksmall, nextpack)) * 2 +
                    regexprmatches('fp7', prev15) * 0.4 +
                    0.4 * regexpcountwithpositions(var('fp7pospos'), prevpacksmall) +
                    0.16 * regexpcountwords(var('fp7pos'), prevpacksmall) +
                    0.1 * regexpcountwithpositions(var('fp7pospos'), nextpack, 1) +
                    regexpcountwords(var('fp7pos'), nextpack) * 0.04 -
                    /* terms subtracted from the score */
                    regexprmatches(var('fp7negheavy'), middle) * 1 -
                    /* NOTE(review): this pattern uses a capturing group while
                       the WT branch uses a non capturing group for the same
                       alternation - confirm regexpcountwords treats both alike. */
                    0.4 * regexpcountwords('(a|g|c|t|u){4}', middle) -
                    regexprmatches(var('fp7neglight'), middle) * 0.3 -
                    regexpcountwithpositions(var('fp7negheavy'), prevpacksmall) * 0.48 -
                    regexpcountwithpositions(var('fp7neglight'), prevpacksmall) * 0.18 -
                    /* NOTE(review): if regexpcountwords returns integers the
                       division by 4 is an integer division in SQLite and
                       truncates - confirm whether a division by 4.0 was meant. */
                    (((regexpcountwords(('\b_*\d+_*\b'), prevpacksmall) + (regexpcountwords(('\b_*\d+_*\b'), nextpack))) / 4)) * 0.2 -
                    regexpcountwithpositions(var('fp7neglight'), nextpack) * 0.03 -
                    regexpcountwithpositions(var('fp7negheavy'), nextpack, 1) * 0.08
            end as confidence
        from (
            /* Packs the window columns into the context strings used by the
               scoring terms: prev is the last two prev columns, prevpack all
               fifteen, prevpacksmall the last seven, nextpack all three next
               columns. */
            select
                id,
                fundingClass,
                docid,
                normalizedacro,
                j2s(prev14, prev15) as prev,
                grantid,
                prev15,
                j2s(prev1, prev2, prev3, prev4, prev5, prev6, prev7, prev8, prev9, prev10, prev11, prev12, prev13, prev14, prev15) as prevpack,
                j2s(prev9, prev10, prev11, prev12, prev13, prev14, prev15) as prevpacksmall,
                middle,
                j2s(next1, next2, next3) as nextpack
            from (
                /* Candidate windows joined with grants. A row is kept when the
                   five digit extraction from middle equals grantid and middle
                   has no run of eight or more digits, or when the six digit
                   extraction equals grantid and middle does not match the
                   longer number exclusion pattern. The textwindow pattern ends
                   with a space inside the literal, which is kept as is. */
                select *
                from (setschema 'docid,prev1, prev2, prev3, prev4, prev5, prev6, prev7, prev8, prev9, prev10, prev11, prev12, prev13, prev14, prev15, middle, next1, next2, next3' select c1 as docid, textwindow(regexpr('(\b\S*?[^0-9\s_]\S*?\s_?)(\d{3})(\s)(\d{3})(_?\s\S*?[^0-9\s_]\S*?\b)', filterstopwords(normalizetext(lower(c2))), '\1\2\4\5'), 15, 3, '((?:(?:\b|\D)0|_|\b|\D)(?:\d{5}))|(((\D|\b)\d{6}(\D|\b))) ')
                    from (select * from pubs where c2 is not null)
                ), grants
                where (
                        not regexprmatches('(?:0|\D|\b)+(?:\d{8,})', middle)
                        and regexpr('(?:0|\D|\b)+(\d{5})', middle) = grantid
                    )
                    or (
                        (not regexprmatches('(\d{6,}(?:\d|i\d{3}_?\b))|(jana\d{6,})', middle))
                        and regexpr('(\d{6})', middle) = grantid
                    )
            )
        )
        where confidence > 0
    )
    group by docid, id
);
|