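# Ad-hoc snippet (run in a Rails console or similar): finds articles that link to a
# given URL, either as an exact URL or sitewide across a whole domain, by querying
# Elasticsearch for matching article ids and handing them to ArticlesSearcher.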
domainWide = "false"; resultType = "total"; page = 0; export = nil; beginDate = (Time.now - 12.months).to_i; endDate = Time.now.to_i; numDays = nil; articleTypes = SearchConstants::ALL_ARTICLE_TYPES; ignoreDomains = []; restrictDomains = []; restrictUniqueDomain = true; offset = 0; restrictLinkPeriod = false | |
url = "http://www.euro.who.int/en/publications/abstracts/adolescent-alcohol-related-behaviours-trends-and-inequalities-in-the-who-european-region,-20022014-2018" | |
result = searchLinks(url, domainWide, resultType, page, export, beginDate, endDate, numDays, articleTypes, ignoreDomains, restrictDomains, restrictUniqueDomain, offset, restrictLinkPeriod) | |
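# Since every argument above matches the corresponding default in the signature
# below, the same search can also be written as:
#
#   result = searchLinks(url)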
def searchLinks(url, domainWide = "false", resultType = "total", page = 0, export = nil,
                beginDate = (Time.now - 12.months).to_i, endDate = Time.now.to_i,
                numDays = nil, articleTypes = SearchConstants::ALL_ARTICLE_TYPES,
                ignoreDomains = [], restrictDomains = [], restrictUniqueDomain = true,
                offset = 0, restrictLinkPeriod = false)
  keywords = []              # (unused in this snippet)
  negativeDomainsArray = []  # (unused in this snippet)
  keyword = nil
  negativeDomains = nil
  keyword, negativeDomains = AlertCreator.new.normalizeUrl(url)
  s = nil
  uniqueResultPerDomain = restrictUniqueDomain
  articleIds = []

  # an explicit numDays window overrides the beginDate/endDate arguments
  if(numDays != nil)
    beginDate = (Time.now - numDays.to_i.days).to_i
    endDate = Time.now.to_i
  end
if(domainWide.to_s == "false") | |
#if url doesn't start with http or https:"pinterest.com DOMAIN_END:-491850732" | |
if(keyword.slice(0, 4).to_s.downcase != "http") | |
keyword = "http://" + keyword.to_s | |
end | |
s = LinksSearcher.new.getUrlSearchObject(keyword) | |
s.size 10000 | |
s.fields ["id"] | |
s.sort { by "id", "desc" } | |
if(restrictDomains.size > 0) | |
puts "Restricting domain to: " + restrictDomains.join(",").to_s | |
s.filter :terms, {"domain_name" => restrictDomains} | |
end | |
s.filter :range, {"published_date" => {"gte" => beginDate}} | |
s.filter :range, {"published_date" => {"lte" => endDate}} | |
SearchUtil.addNegativeDomains(s, ignoreDomains) | |
curl = s.to_curl | |
puts "URL Links search: #{curl}" | |
Rails.logger.info("Links Search: " + curl.to_s) | |
  else # sitewide domain
    origUrl = url
    GeneralLog.log(@@className, "Sitewide domain for #{origUrl}")
    url = Addressable::URI.encode(url)
    url = "http://#{url}" if Addressable::URI.parse(url).scheme.nil?
    host = Addressable::URI.parse(url).host.downcase
    # tokenize the host the way the "domains" field is indexed: strip a leading
    # www., replace dots with spaces, and delimit with DOMAIN_END
    url = host.start_with?('www.') ? "DOMAIN_END " + host[4..-1] : host
    url = url.gsub(".", " ") + " DOMAIN_END"
    puts "URL: #{url}"
    url = url.gsub("-", "buzzsumodelimiter")
    puts "Link Domain to search: " + url.to_s

    ignoreStr = ""
    if(ignoreDomains == nil or ignoreDomains.size == 0)
      # placeholder so at least one not-term clause is always emitted
      ignoreDomains = ["randomdomaindomain.com"]
    end
    ignoreDomains.each do |ignoreDomain|
      ignoreStr = ignoreStr + "{\"not\":{\"term\":{\"domain_name\":\"#{ignoreDomain}\"}}},"
      ignoreStr = ignoreStr + "{\"not\":{\"term\":{\"subdomain\":\"#{ignoreDomain}\"}}},"
    end
    ignoreStr = ignoreStr + "{\"range\":{\"published_date\":{\"gte\":#{beginDate}}}},"
    ignoreStr = ignoreStr + "{\"range\":{\"published_date\":{\"lte\":#{endDate}}}}"

    restrictStr = ""
    if(restrictDomains.size > 0)
      # quote each domain separately so the JSON array has one string per domain
      restrictStr = restrictStr + "{\"terms\":{\"domain_name\":[\"#{restrictDomains.join("\",\"")}\"]}},"
    end
    ignoreStr = ignoreStr.chomp(",")
    combinedStr = restrictStr + ignoreStr

    curl = "curl -X GET http://" + ENV["ELASTIC_SEARCH_IP"].to_s + ":9200/" + SearchConstants::ALL_LINKS_INDICES.join(',') + "/_search?pretty -d '{\"size\" : 10000, \"sort\" : [{\"id\": {\"order\": \"desc\"}}], \"fields\" : [\"id\"], \"query\":{ \"constantScore\" : { \"filter\" : { \"query\": { \"filtered\" : { \"query\" : { \"query_string\" : { \"query\": \"\\\"#{url}\\\"\",\"fields\" : [\"domains\"], \"phrase_slop\" : 0 } },\"filter\":{\"and\":[#{combinedStr}]} } } } }} }'"
    puts "Links Curl: #{curl}"
    Rails.logger.info("Links Search: " + curl.to_s)
  end
  # cache lookup is stubbed out here, so the Elasticsearch query always runs
  cachedArticleIds = nil
  results = json = ipUsed = nil
  if(cachedArticleIds == nil)
    Rails.logger.info("Links: Getting Links Results")
    results, json, ipUsed = SearchUtil.getTireResults(curl, "links")
    articleIds = []
    Rails.logger.info("Links: Finished Getting Links Results")
    results.each do |result|
      articleIds << result.id.to_i
    end
  else
    # restore Integer ids, matching the result.id.to_i conversion above
    articleIds = cachedArticleIds.split(",").map(&:to_i)
  end
  Rails.logger.info("Links: Article Ids that match: " + articleIds.join(","))
  # cache the matching ids for 10 minutes, keyed by the generated query
  $redis.setex("links_#{curl}", 60 * 10, articleIds.join(","))
  return ArticlesSearcher.new.getArticles(articleIds, resultType, page, export, beginDate, endDate, uniqueResultPerDomain, articleTypes, offset)[0]
end
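# The cached ids written via $redis.setex above are never read back in this snippet
# (cachedArticleIds is hard-coded to nil). A minimal sketch of the read side, as a
# hypothetical helper that is not part of the original gist, assuming the same
# $redis client and key scheme:
def cachedLinkIds(curl)
  cached = $redis.get("links_#{curl}")     # nil once the 10-minute TTL has expired
  cached && cached.split(",").map(&:to_i)  # restore the Integer ids stored by setex
end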