Project

General

Profile

1
/* fp7pos: merged regular expression built from column c1 of fp7positives, with the entries supplied longest first. */
hidden var 'fp7pos' from
select jmergeregexp(jgroup(c1))
from (
    select c1
    from fp7positives
    order by length(c1) desc
);
2
/* fp7negheavy: merged regular expression built from column c1 of fp7strongfilterwords, with the entries supplied longest first. */
hidden var 'fp7negheavy' from
select jmergeregexp(jgroup(c1))
from (
    select c1
    from fp7strongfilterwords
    order by length(c1) desc
);
3
/* fp7neglight: merged regular expression built from column c1 of fp7weakfilterwords, with the entries supplied longest first. */
hidden var 'fp7neglight' from
select jmergeregexp(jgroup(c1))
from (
    select c1
    from fp7weakfilterwords
    order by length(c1) desc
);
4
/* fp7pospos: merged regular expression built from column c1 of the fp7pospos table, with the entries supplied longest first. */
hidden var 'fp7pospos' from
select jmergeregexp(jgroup(c1))
from (
    select c1
    from fp7pospos
    order by length(c1) desc
);
5
/*
 * fp7middlepos: merged regular expression built from the de-duplicated union
 * of fp7positives, fp7pospos and the fp7middlepos table.
 * The entries are supplied longest first, like every other pattern variable
 * in this script. The ordering is applied in an outer select because an
 * expression such as length(c1) cannot be used directly in the ORDER BY of a
 * compound select.
 */
hidden var 'fp7middlepos' from
select jmergeregexp(jgroup(c1))
from (
    select c1
    from (
        select * from fp7positives
        union
        select * from fp7pospos
        union
        select * from fp7middlepos
    )
    order by length(c1) desc
);
6
/* wtnegheavy: merged regular expression built from column c1 of wtstrongfilterwords, with the entries supplied longest first. */
hidden var 'wtnegheavy' from
select jmergeregexp(jgroup(c1))
from (
    select c1
    from wtstrongfilterwords
    order by length(c1) desc
);
7
/* wtneglight: merged regular expression built from column c1 of wtweakfilterwords, with the entries supplied longest first. */
hidden var 'wtneglight' from
select jmergeregexp(jgroup(c1))
from (
    select c1
    from wtweakfilterwords
    order by length(c1) desc
);
8
/* wtpospos: merged regular expression built from column c1 of wtposposwords, with the entries supplied longest first. */
hidden var 'wtpospos' from
select jmergeregexp(jgroup(c1))
from (
    select c1
    from wtposposwords
    order by length(c1) desc
);
9

    
10
/*
 * pubs: temporary table holding the input documents.
 * Each row read from stdinput() has its $.id and $.text JSON paths extracted
 * from c1, and setschema names the two resulting columns c1 and c2.
 */
create temp table pubs as
    setschema 'c1,c2'
    select jsonpath(c1, '$.id', '$.text')
    from stdinput();
11

    
12
/*
 * Emits one jdict per matched (document, project) pair with the keys
 * documentId, projectId and confidenceLevel. Two result sets are concatenated
 * with UNION ALL.
 *
 * 1. Fixed-confidence matches (0.8): the document text (c2 after utf8clean,
 *    newlines replaced by a space) is windowed with textwindow2s, and the
 *    upper-cased capture taken from the middle window must equal grantid.
 *
 * 2. Scored matches: the normalised, lower-cased, stop-word-filtered text is
 *    windowed with textwindow (15 previous tokens, 3 next tokens). A row joins
 *    to grants when a 5 or 6 digit number extracted from middle equals grantid.
 *    The score is computed per fundingClass, only scores above 0 are kept, and
 *    the maximum per (docid, id) is reported as sqroot(min(1.49, score) / 1.5).
 *
 * String literals use single quotes throughout. Double-quoted text is an
 * identifier in SQL and is treated as a string only through a SQLite fallback.
 */
select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8)
from (
    select docid, id
    from (
        select
            docid,
            upper(regexpr('(\w+.*\d+)', middle)) as match,
            id,
            grantid
        from (
            select
                c1 as docid,
                textwindow2s(regexpr('\n', utf8clean(c2), ' '), 0, 1, 0, '.+\/\w+\/\d{4}\W*\Z')
            from pubs
        ),
        grants
        where match = grantid
    )
    group by docid, id
)

union all

select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', sqroot(min(1.49, confidence) / 1.5))
from (
    select docid, id, max(confidence) as confidence
    from (
        select
            docid,
            id,
            case when fundingClass = 'WT' then /* wellcome trust confidence */
                /* terms that raise the score */
                (regexpcountwords(var('wtpospos'), j2s(prevpack, nextpack)) * regexpcountwords('(?:collaborative|joint call)', j2s(prevpack, nextpack))) * 0.33 +
                regexprmatches('\d{5}ma(?:\b|_)', middle) +
                regexprmatches('(?:\d{5}_)(?:z|c|b|a)(?:_\d{2}_)(?:z|c|b|a)', middle) * 2 +
                regexpcountwords(var('wtpospos'), prev) * 0.5 +
                regexprmatches(var('wtpospos'), middle) +
                regexpcountwithpositions(var('wtpospos'), prevpack) * 0.39 +
                0.21 * regexpcountwithpositions(var('wtpospos'), nextpack, 1) -
                /* terms that lower the score */
                (regexprmatches(var('wtnegheavy'), middle) + regexprmatches('(?:n01|r01|dms)', middle) + regexprmatches('(?:ns|mh|hl|hd|ai)(?:_|)\d{5}', middle)) * 10 -
                4 * regexpcountwords('(?:a|g|c|t|u){4}', middle) -
                regexprmatches(var('wtneglight'), middle) * 0.3 -
                regexpcountwithpositions(var('wtnegheavy'), prevpacksmall, 0, 1, 0.5) * 0.39 -
                regexpcountwithpositions(var('wtneglight'), prevpacksmall, 0, 1, 0.5) * 0.18 -
                0.45 * regexpcountwithpositions(var('wtneglight'), nextpack) -
                0.21 * regexpcountwithpositions(var('wtnegheavy'), nextpack, 1)
            else /* fp7 confidence */
                /* terms that raise the score */
                regexprmatches(var('fp7middlepos'), middle) +
                regexprmatches('(?:\b|_|\d)' || normalizedacro || '(?:\b|_|\d)', j2s(middle, prevpacksmall, nextpack)) * 2 +
                regexprmatches('fp7', prev15) * 0.4 +
                0.4 * regexpcountwithpositions(var('fp7pospos'), prevpacksmall) +
                0.16 * regexpcountwords(var('fp7pos'), prevpacksmall) +
                0.1 * regexpcountwithpositions(var('fp7pospos'), nextpack, 1) +
                regexpcountwords(var('fp7pos'), nextpack) * 0.04 -
                /* terms that lower the score */
                regexprmatches(var('fp7negheavy'), middle) * 1 -
                0.4 * regexpcountwords('(a|g|c|t|u){4}', middle) -
                regexprmatches(var('fp7neglight'), middle) * 0.3 -
                regexpcountwithpositions(var('fp7negheavy'), prevpacksmall) * 0.48 -
                regexpcountwithpositions(var('fp7neglight'), prevpacksmall) * 0.18 -
                /* NOTE(review): if regexpcountwords returns integers, the division by 4 truncates - confirm that is intended */
                ((regexpcountwords('\b_*\d+_*\b', prevpacksmall) + regexpcountwords('\b_*\d+_*\b', nextpack)) / 4) * 0.2 -
                regexpcountwithpositions(var('fp7neglight'), nextpack) * 0.03 -
                regexpcountwithpositions(var('fp7negheavy'), nextpack, 1) * 0.08
            end as confidence
        from (
            /* context packs used by the scoring expression above */
            select
                id,
                fundingClass,
                docid,
                normalizedacro,
                j2s(prev14, prev15) as prev,
                grantid,
                prev15,
                j2s(prev1, prev2, prev3, prev4, prev5, prev6, prev7, prev8, prev9, prev10, prev11, prev12, prev13, prev14, prev15) as prevpack,
                j2s(prev9, prev10, prev11, prev12, prev13, prev14, prev15) as prevpacksmall,
                middle,
                j2s(next1, next2, next3) as nextpack
            from (
                select *
                from (
                    /* NOTE(review): the textwindow pattern ends with a literal space before its closing quote - confirm that is intentional */
                    setschema 'docid,prev1, prev2, prev3, prev4, prev5, prev6, prev7, prev8, prev9, prev10, prev11, prev12, prev13, prev14, prev15, middle, next1, next2, next3'
                    select
                        c1 as docid,
                        textwindow(regexpr('(\b\S*?[^0-9\s_]\S*?\s_?)(\d{3})(\s)(\d{3})(_?\s\S*?[^0-9\s_]\S*?\b)', filterstopwords(normalizetext(lower(c2))), '\1\2\4\5'), 15, 3, '((?:(?:\b|\D)0|_|\b|\D)(?:\d{5}))|(((\D|\b)\d{6}(\D|\b))) ')
                    from (select * from pubs where c2 is not null)
                ),
                grants
                where
                    /* 5 digit grant number, rejected when middle holds a run of 8 or more digits */
                    (
                        not regexprmatches('(?:0|\D|\b)+(?:\d{8,})', middle)
                        and regexpr('(?:0|\D|\b)+(\d{5})', middle) = grantid
                    )
                    /* 6 digit grant number, rejected when the exclusion pattern matches middle */
                    or (
                        (not regexprmatches('(\d{6,}(?:\d|i\d{3}_?\b))|(jana\d{6,})', middle))
                        and regexpr('(\d{6})', middle) = grantid
                    )
            )
        )
        where confidence > 0
    )
    group by docid, id
);
(2-2/2)