Last active
November 3, 2021 16:04
-
-
Save jeremyf/662585f5c4d22184a6ae133a71bf891a to your computer and use it in GitHub Desktop.
Articles::Feeds::WeightedQueryStrategy class as of 2021-11-3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Articles | |
module Feeds | |
# @api private | |
# | |
# This is an experimental object that we're refining to be a | |
# competetor to the existing feed strategies. | |
# | |
# It works to implement conceptual parity with two methods of | |
# Articles::Feeds::LargeForemExperimental: | |
# | |
# - #default_home_feed | |
# - #more_comments_minimal_weight_randomized | |
# | |
# What do we mean by "conceptual parity"? Those two methods are | |
# used in the two feeds controllers: StoriesController and | |
# Stories::FeedsController. And while they use some of the | |
# internal tooling there's some notable subtle differences. | |
# | |
# Where this class differs is that it is aiming to build the feed | |
# based from the given user's perspective. Whereas the other Feed | |
# algorithm starts with a list of candidates that are global to | |
# the given Forem (e.g., starting the base query from the | |
# `articles.score`, a volatile and swingy value that favors global | |
# reactions over user desired content). | |
# | |
# This is not quite a chronological only feed but could be easily | |
# modified to favor that. | |
# | |
# @note One possible short-coming is that the query does not | |
# account for the Forem's administrators. | |
# @note For those considering extending this, be very mindful of | |
# SQL injection. | |
class WeightedQueryStrategy | |
# This constant defines the allowable relevance scoring methods. | |
# | |
# A scoring method should be a SQL fragement that produces a | |
# value between 0 and 1. The closer the value is to 1, the more | |
# relevant the article is for the given user. Note: the values | |
# are multiplicative. Make sure to consider if you want a 0 | |
# multiplier for your score. Aspirationally, you may want to | |
# think of the relevance_score as the range (0,1]. That is | |
# greater than 0 and less than or equal to 1. | |
# | |
# In addition, as part of initialization, the caller can | |
# configure each of the scoring methods :cases and :fallback. | |
# | |
# Each scoring method has the following keys: | |
# | |
# - clause: The SQL clause statement; note: there exists a | |
# coupling between the clause and the SQL fragments | |
# that join the various tables. Also, under no | |
# circumstances should you allow any user value for | |
# this, as it is not something we can sanitize. | |
# | |
# - cases: An Array of Arrays, the first value is what matches | |
# the clause, the second value is the multiplicative | |
# factor. | |
# | |
# - fallback: When no case is matched use this factor. | |
# | |
# - requires_user: Does this scoring method require a given | |
# user. If not, don't use it if we don't have | |
# a nil user. | |
SCORING_METHOD_CONFIGURATIONS = { | |
# Weight to give based on the age of the article. | |
daily_decay_factor: { | |
clause: "(current_date - published_at::date)", | |
cases: [ | |
[0, 1], [1, 0.95], [2, 0.9], | |
[3, 0.85], [4, 0.8], [5, 0.75], | |
[6, 0.7], [7, 0.65], [8, 0.6], | |
[9, 0.55], [10, 0.5], [11, 0.4], | |
[12, 0.3], [13, 0.2], [14, 0.1] | |
], | |
fallback: 0.001, | |
requires_user: false | |
}, | |
# Weight to give for the number of comments on the article | |
# from other users that the given user follows. | |
comment_count_by_those_followed_factor: { | |
clause: "COUNT(comments_by_followed.id)", | |
cases: [[0, 0.95], [1, 0.98], [2, 0.99]], | |
fallback: 0.93, | |
requires_user: true | |
}, | |
# Weight to give to the number of comments on the article. | |
comments_count_factor: { | |
clause: "articles.comments_count", | |
cases: [[0, 0.9], [1, 0.94], [2, 0.95], [3, 0.98], [4, 0.999]], | |
fallback: 1, | |
requires_user: false | |
}, | |
# Weight to give based on the difference between experience | |
# level of the article and given user. | |
experience_factor: { | |
clause: "ROUND(ABS(articles.experience_level_rating - (SELECT | |
(CASE | |
WHEN experience_level IS NULL THEN :default_user_experience_level | |
ELSE experience_level END ) AS user_experience_level | |
FROM users_settings WHERE users_settings.user_id = :user_id | |
)))", | |
cases: [[0, 1], [1, 0.98], [2, 0.97], [3, 0.96], [4, 0.95], [5, 0.94]], | |
fallback: 0.93, | |
requires_user: true | |
}, | |
# Weight to give when the given user follows the article's | |
# author. | |
following_author_factor: { | |
clause: "COUNT(followed_user.follower_id)", | |
cases: [[0, 0.8], [1, 1]], | |
fallback: 1, | |
requires_user: true | |
}, | |
# Weight to give to the when the given user follows the | |
# article's organization. | |
following_org_factor: { | |
clause: "COUNT(followed_org.follower_id)", | |
cases: [[0, 0.95], [1, 1]], | |
fallback: 1, | |
requires_user: true | |
}, | |
# Weight to give an article based on it's most recent comment. | |
latest_comment_factor: { | |
clause: "(current_date - MAX(comments.created_at)::date)", | |
cases: [[0, 1], [1, 0.9988]], | |
fallback: 0.988, | |
requires_user: false | |
}, | |
# Weight to give for the number of intersecting tags the given | |
# user follows and the article has. | |
matching_tags_factor: { | |
clause: "COUNT(followed_tags.follower_id)", | |
cases: [[0, 0.4], [1, 0.9]], | |
fallback: 1, | |
requires_user: true | |
}, | |
# Weight to give for the number of reactions on the article. | |
reactions_factor: { | |
clause: "articles.reactions_count", | |
cases: [[0, 0.9988], [1, 0.9988], [2, 0.9988], [3, 0.9988]], | |
fallback: 1, | |
requires_user: false | |
}, | |
# Weight to give based on spaminess of the article. | |
spaminess_factor: { | |
clause: "articles.spaminess_rating", | |
cases: [[0, 1]], | |
fallback: 0, | |
requires_user: false | |
} | |
}.freeze | |
DEFAULT_USER_EXPERIENCE_LEVEL = 5 | |
# @param user [User] who are we querying for? | |
# @param number_of_articles [Integer] how many articles are we | |
# returning | |
# @param page [Integer] what is the pagination page | |
# @param tag [String, nil] this isn't implemented in other feeds | |
# so we'll see | |
# @param config [Hash<Symbol, Object>] a list of configurations, | |
# see {#initialize} implementation details. | |
# @option config [Array<Symbol>] :scoring_configs | |
# allows for you to configure which methods you want to use. | |
# This is most relevant when running A/B testing. | |
# @option config [Integer] :most_number_of_days_to_consider | |
# defines the oldest published date that we'll consider. | |
# | |
# @todo I envision that we will tweak the factors we choose, so | |
# those will likely need some kind of structured consideration. | |
def initialize(user: nil, number_of_articles: 50, page: 1, tag: nil, **config) | |
@user = user | |
@number_of_articles = number_of_articles.to_i | |
@page = page.to_i | |
# TODO: The tag parameter is vestigial, there's no logic around this value. | |
@tag = tag | |
@default_user_experience_level = config.fetch(:default_user_experience_level) { DEFAULT_USER_EXPERIENCE_LEVEL } | |
@oldest_published_at = determine_oldest_published_at( | |
user: @user, | |
most_number_of_days_to_consider: config.fetch(:most_number_of_days_to_consider, 31), | |
) | |
@scoring_configs = config.fetch(:scoring_configs) { default_scoring_configs } | |
configure!(scoring_configs: @scoring_configs) | |
end | |
# The goal of this query is to generate a list of articles that | |
# are relevant to the user's interest. | |
# | |
# First we give a score to an article based on it's publication | |
# date. The max possible score is 1. | |
# | |
# Then we begin multiplying that score by numbers between 0 and | |
# 1. The closer that multiplier is to 1 the "more relevant" | |
# that factor is to the user. | |
# | |
# @param only_featured [Boolean] select only articles that are | |
# "featured" | |
# @param must_have_main_image [Boolean] select only articles that have | |
# a main image. | |
# @param limit [Integer] the number of records to return | |
# @param offset [Integer] start the paging window at the given offset | |
# @param omit_article_ids [Array] don't include these articles in the search results | |
# | |
# @return ActiveRecord::Relation for Article | |
# rubocop:disable Layout/LineLength | |
def call(only_featured: false, must_have_main_image: false, limit: default_limit, offset: default_offset, omit_article_ids: []) | |
unsanitized_sub_sql = if @user.nil? | |
[ | |
sql_sub_query_for_nil_user( | |
only_featured: only_featured, | |
must_have_main_image: must_have_main_image, | |
limit: limit, | |
offset: offset, | |
omit_article_ids: omit_article_ids, | |
), | |
{ | |
oldest_published_at: @oldest_published_at, | |
omit_article_ids: omit_article_ids | |
}, | |
] | |
else | |
[ | |
sql_sub_query_for_existing_user( | |
only_featured: only_featured, | |
must_have_main_image: must_have_main_image, | |
limit: limit, | |
offset: offset, | |
omit_article_ids: omit_article_ids, | |
), | |
{ | |
user_id: @user.id, | |
default_user_experience_level: @default_user_experience_level.to_i, | |
oldest_published_at: @oldest_published_at, | |
omit_article_ids: omit_article_ids | |
}, | |
] | |
end | |
# This sub-query allows us to take the hard work of the | |
# hand-coded unsanitized sql and create a sub-query that we | |
# can use to help ensure that we can use all of the | |
# ActiveRecord goodness of scopes (e.g., | |
# limited_column_select) and eager includes. | |
Article.where( | |
Article.arel_table[:id].in( | |
Arel.sql( | |
Article.sanitize_sql(unsanitized_sub_sql) | |
) | |
) | |
).limited_column_select.includes(top_comments: :user).order(published_at: :desc) | |
end | |
# rubocop:enable Layout/LineLength | |
# Provided as a means to align interfaces with existing feeds. | |
# | |
# @note I really dislike this method name as it is opaque on | |
# it's purpose. | |
alias more_comments_minimal_weight_randomized call | |
# The featured story should be the article that: | |
# | |
# - has the highest relevance score for the nil_user version | |
# - has `featured = true` | |
# - (OPTIONALLY) has a main image | |
# | |
# The other articles should use the nil_user version and require | |
# the `featured = true` attribute. In my envisioned | |
# implementation, the pagination would omit the featured story. | |
# | |
# @param feature_requires_image [Boolean] do we mandate that the | |
# featured story/stories require an image? | |
# @return [Array<Article, Array<Article>] a featured story | |
# Article and an array of Article objects. | |
# | |
# @note Per prior work, a featured story is the article that has | |
# a main image, is marked as featured (e.g., featured = | |
# true), and has the highest relevance score. In the | |
# Articles::Feeds::LargeForemExperimental object we used | |
# the hotness_score to determine which to use. The | |
# hotness score is most analogue to how this class | |
# calculates the relevance score when we don't have a | |
# user. | |
# @note There are requests to allow for the featured article to | |
# NOT require a main image. We're still talking through | |
# what that means. | |
def featured_story_and_default_home_feed(feature_requires_image: true) | |
# We could parameterize this, but callers would need to | |
# consider the impact of that decision, and it would break the | |
# current contract. | |
number_of_featured_stories = 1 | |
featured_story = call( | |
feature_requires_image: feature_requires_image, | |
only_featured: true, | |
limit: number_of_featured_stories, | |
offset: 0, | |
) | |
articles = call( | |
feature_requires_image: feature_requires_image, | |
only_featured: true, | |
# Make sure that we don't include the featured_story | |
omit_article_ids: [featured_story.id], | |
) | |
[featured_story, articles] | |
end | |
# @note In the LargeForemExperimental implementation, the | |
# default home feed omits the featured story. In this | |
# case, I don't want to do that. Instead, I want to see | |
# how this behaves. | |
def default_home_feed(feature_requires_image: true, **) | |
call( | |
feature_requires_image: feature_requires_image, | |
only_featured: true, | |
) | |
end | |
private | |
# The sql statement for selecting based on relevance scores that | |
# are for nil users. | |
def sql_sub_query_for_nil_user(only_featured:, must_have_main_image:, limit:, offset:, omit_article_ids:) | |
where_clause = build_sql_with_where_clauses( | |
only_featured: only_featured, | |
must_have_main_image: must_have_main_image, | |
omit_article_ids: omit_article_ids, | |
) | |
<<~THE_SQL_STATEMENT | |
SELECT articles.id | |
FROM articles | |
LEFT OUTER JOIN comments | |
ON comments.commentable_id = articles.id | |
AND comments.commentable_type = 'Article' | |
WHERE #{where_clause} | |
GROUP BY articles.id | |
ORDER BY (#{relevance_score_components_as_sql}) DESC, | |
articles.published_at DESC | |
#{offset_and_limit_clause(offset: offset, limit: limit)} | |
THE_SQL_STATEMENT | |
end | |
# The sql statement for selecting based on relevance scores that | |
# are user required. | |
def sql_sub_query_for_existing_user(only_featured:, must_have_main_image:, limit:, offset:, omit_article_ids:) | |
where_clause = build_sql_with_where_clauses( | |
only_featured: only_featured, | |
must_have_main_image: must_have_main_image, | |
omit_article_ids: omit_article_ids, | |
) | |
<<~THE_SQL_STATEMENT | |
SELECT articles.id | |
FROM articles | |
LEFT OUTER JOIN taggings | |
ON taggings.taggable_id = articles.id | |
AND taggable_type = 'Article' | |
INNER JOIN tags | |
ON taggings.tag_id = tags.id | |
LEFT OUTER JOIN follows AS followed_tags | |
ON tags.id = followed_tags.followable_id | |
AND followed_tags.followable_type = 'ActsAsTaggableOn::Tag' | |
AND followed_tags.follower_type = 'User' | |
AND followed_tags.follower_id = :user_id | |
LEFT OUTER JOIN follows AS followed_user | |
ON articles.user_id = followed_user.followable_id | |
AND followed_user.followable_type = 'User' | |
AND followed_user.follower_id = :user_id | |
AND followed_user.follower_type = 'User' | |
LEFT OUTER JOIN follows AS followed_org | |
ON articles.organization_id = followed_org.followable_id | |
AND followed_org.followable_type = 'Organization' | |
AND followed_org.follower_id = :user_id | |
AND followed_org.follower_type = 'User' | |
LEFT OUTER JOIN comments AS comments_by_followed | |
ON comments_by_followed.commentable_id = articles.id | |
AND comments_by_followed.commentable_type = 'Article' | |
AND followed_user.followable_id = comments_by_followed.user_id | |
AND followed_user.followable_type = 'User' | |
LEFT OUTER JOIN user_blocks | |
ON user_blocks.blocked_id = articles.user_id | |
AND user_blocks.blocked_id IS NULL | |
AND user_blocks.blocker_id = :user_id | |
LEFT OUTER JOIN comments | |
ON comments.commentable_id = articles.id | |
AND comments.commentable_type = 'Article' | |
WHERE #{where_clause} | |
GROUP BY articles.id | |
ORDER BY (#{relevance_score_components_as_sql}) DESC, | |
articles.published_at DESC | |
#{offset_and_limit_clause(offset: offset, limit: limit)} | |
THE_SQL_STATEMENT | |
end | |
# @todo Do we want to favor published at for scoping, or do we | |
# want to consider `articles.last_comment_at`. If we do, | |
# we must remember to add an index to that field. | |
def build_sql_with_where_clauses(only_featured:, must_have_main_image:, omit_article_ids:) | |
where_clauses = "articles.published = true AND articles.published_at > :oldest_published_at" | |
where_clauses += " AND articles.id NOT IN (:omit_article_ids)" unless omit_article_ids.empty? | |
where_clauses += " AND articles.featured = true" if only_featured | |
where_clauses += " AND articles.main_image IS NOT NULL" if must_have_main_image | |
where_clauses | |
end | |
def offset_and_limit_clause(offset:, limit:) | |
if offset.to_i.positive? | |
Article.sanitize_sql_array(["OFFSET ? LIMIT ?", offset, limit]) | |
else | |
Article.sanitize_sql_array(["LIMIT ?", limit]) | |
end | |
end | |
def determine_oldest_published_at(user:, most_number_of_days_to_consider: 31) | |
user&.page_views&.second_to_last&.created_at || most_number_of_days_to_consider.days.ago | |
end | |
# We multiply the relevance score components together. | |
def relevance_score_components_as_sql | |
@relevance_score_components.join(" * ") | |
end | |
def default_limit | |
@number_of_articles.to_i | |
end | |
def default_offset | |
return 0 if @page == 1 | |
@page.to_i - (1 * default_limit) | |
end | |
# By default, we use all of the possible scoring methods. | |
def default_scoring_configs | |
SCORING_METHOD_CONFIGURATIONS | |
end | |
# This method converts the caller provided :scoring_configs into | |
# an array of SQL clause fragments. | |
# | |
# @param scoring_configs [Hash] the caller provided configurations. | |
# | |
# @see SCORING_METHOD_CONFIGURATIONS | |
# @note Be mindful to guard against SQL injection! | |
def configure!(scoring_configs:) | |
@relevance_score_components = [] | |
# We looping through the possible scoring method | |
# configurations, we're only accepting those as valid | |
# configurations. | |
SCORING_METHOD_CONFIGURATIONS.each_pair do |valid_method_name, default_config| | |
# Don't attempt to use this factor if we don't have user. | |
next if default_config.fetch(:requires_user) && @user.nil? | |
# Ensure that we're only using a scoring configuration that | |
# the caller provided. | |
next unless scoring_configs.key?(valid_method_name) | |
scoring_config = scoring_configs.fetch(valid_method_name) | |
# If the caller didn't provide a hash for this scoring configuration, | |
# then we'll use the default configuration. | |
scoring_config = default_config unless scoring_config.is_a?(Hash) | |
@relevance_score_components << build_score_element_from( | |
# Under NO CIRCUMSTANCES should you trust the caller to | |
# provide a valid :clause. Don't trust them to send a | |
# valid clause. That's the path of SQL injection. | |
clause: default_config.fetch(:clause), | |
# We can trust the :cases and :fallback a bit more, as we | |
# later cast them to integers and floats. | |
cases: scoring_config.fetch(:cases), | |
fallback: scoring_config.fetch(:fallback), | |
) | |
end | |
end | |
# Responsible for transforming the :clause, :cases, and | |
# :fallback into a SQL fragment that we can use to multiply with | |
# the other SQL fragments. | |
# | |
# @param clause [String] | |
# @param cases [Array<Array<#to_i, #to_f>>] | |
# @param fallback [#to_f] | |
def build_score_element_from(clause:, cases:, fallback:) | |
values = [] | |
# I would love to sanitize this, but alas, we must trust this | |
# clause. | |
text = "(CASE #{clause}" | |
cases.each do |value, factor| | |
text += "\nWHEN ? THEN ?" | |
values << value.to_i | |
values << factor.to_f | |
end | |
text += "\nELSE ? END)" | |
values << fallback.to_f | |
values.unshift(text) | |
Article.sanitize_sql_array(values) | |
end | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment