Google is the most used search engine (92% market share). We wanted to know how Google search results compare to those of other search engines, namely Bing, DuckDuckGo, and Yahoo.
We analysed about 56k random keyword searches for 4 search engines including Google, and came up with some insights based on the 30 top ranked results.
The results of Google’s competitors are more similar between themselves than similar to Google’s. DuckDuckGo is the most similar to Google (31%) and Yahoo is the least similar (29%).
For all search engines, the similarity doesn’t change much with search volume, though search engine results are consistently more similar to Google’s for medium volume (between 10k and 100k), with gains between 1.5% and 2% when comparing volumes above 10k with volumes below 10k.
Similarity differs by category. Real Estate and Travel & Tourism are more similar across all search engines, while Apparel related results are less similar.
Similarity is higher for long searches.
The search engines differ in how often big domains such as Amazon appear as their top result.
Searches that tend to yield results predominantly linking to a specific domain (domain specific), OR non domain specific at all, show lower similarity to Google on average.
In more than half of the cases in any search engine, Google’s top result is found in position 1 to 3. For 25% to 35% of keywords for a given search engine, Google’s top result won’t be found in the top 30.
library(tidyverse) # package used for data wrangling
library(ggtext) # for text formatting
library(ggforce) # for donut charts
library(patchwork) # for combining plots together
# options(dplyr.summarise.inform = F)

# "datasets_for_report.Rdata" is produced by the scripts below (kept commented out
# so the report only has to load the saved result):
# source(here::here("scripts/01_build_light_dataset.R"))
# source(here::here("scripts/02_build_datasets_for_report.R"))
# source(here::here("scripts/03_similarity_metrics.R"))
load(here::here("proc_data/datasets_for_report.Rdata"))

# Global plot theme: theme_minimal() with the report-wide overrides layered on top
theme_set(
  theme_minimal(base_size = 12, base_family = "Poppins") +
    theme(
      # titles and outer frame
      plot.title.position = "plot",
      plot.title = element_text(face = "bold", margin = margin(b = 10)),
      plot.margin = margin(10, 20, 10, 20),
      plot.background = element_rect(fill = "#D3D2F9", color = NA),
      # legends are off by default; individual plots switch them back on
      legend.position = "none",
      # axes
      axis.title.x.bottom = element_text(
        color = "grey60", size = rel(0.7), hjust = 1,
        margin = margin(t = 5), face = "bold"
      ),
      axis.title.y.left = element_text(
        color = "grey60", size = rel(0.7), hjust = 1,
        margin = margin(r = 5), face = "bold"
      ),
      axis.text = element_text(size = rel(0.75)),
      # facet strips and panels
      strip.background = element_rect(fill = "#807FFF", color = "white", size = 1.5),
      strip.text = element_text(color = "white", face = "bold"),
      panel.grid.minor = element_blank(),
      panel.spacing.x = unit(2.5, "lines"),
      panel.spacing.y = unit(2.5, "lines")
    )
)
Due to technicalities fetching data with the API, 3% of our search results among the top 30 results of our search engines could not be identified, i.e. we miss observations.
We believe this shouldn’t noticeably affect our findings. Since removing every keyword showing an anomaly would drastically reduce the sample size, we built artificial observations for these cases, so we end up with 6,717,480 observations: 55,979 keywords times 4 search engines times 30 results.
We define similarity between two search engines as the fraction of results (identified by url) that we find in both top 10 results of the pair.
We see below that on average 70% of top 10 results on yahoo can be also found in the top 10 of Bing, making them the most similar pair.
Google is very dissimilar to other search engines, especially Yahoo: it shares only 29% of top 10 results with the latter. It is most similar to DuckDuckGo, with a similarity indicator of 31%.
se_list <- c("Google", "Yahoo", "Bing", "DuckDuckGo")

# The data spells DuckDuckGo as "DuckDuck"; use the full name in both engine columns
similarity_by_se_10 <-
  similarity_by_se_10 %>%
  mutate(across(
    c(search_engine, search_engine2),
    ~ if_else(.x == "DuckDuck", "DuckDuckGo", .x)
  ))

# One row per ordered pair of engines. The similarity is looked up with the pair
# in either orientation, and whichever join matched is kept.
df <-
  crossing(se_1 = se_list, se_2 = se_list) %>%
  left_join(
    similarity_by_se_10 %>% select(-descr),
    by = c("se_1" = "search_engine", "se_2" = "search_engine2")
  ) %>%
  left_join(
    similarity_by_se_10 %>% select(-descr),
    by = c("se_1" = "search_engine2", "se_2" = "search_engine")
  ) %>%
  mutate(
    similarity = coalesce(similarity.x, similarity.y),
    se_1 = factor(se_1, levels = c("Yahoo", "Bing", "DuckDuckGo", "Google")),
    se_2 = factor(se_2, levels = c("Yahoo", "Bing", "DuckDuckGo", "Google")),
    # tiles that get blanked out: the diagonal and the (DuckDuckGo, Bing) cell
    include_exclude = case_when(
      se_1 == se_2 ~ "exclude",
      se_1 == "DuckDuckGo" & se_2 == "Bing" ~ "exclude",
      TRUE ~ "include"
    ),
    label = ifelse(is.na(similarity), NA, paste0(round(similarity * 100, 0), "%"))
  ) %>%
  select(se_1, se_2, similarity, include_exclude, label)

p <-
  ggplot(data = df, mapping = aes(x = se_1, y = se_2, fill = similarity)) +
  geom_tile(color = "#D3D2F9", size = 2.5) +
  # excluded tiles are painted over in #D3D2F9 and receive no label
  geom_tile(
    data = filter(df, include_exclude == "exclude"),
    fill = "#D3D2F9"
  ) +
  geom_text(
    data = filter(df, include_exclude == "include"),
    mapping = aes(label = label),
    size = 7,
    family = "Poppins",
    fontface = "bold",
    color = "white"
  ) +
  scale_x_discrete(limits = c("Yahoo", "Bing", "DuckDuckGo")) +
  scale_y_discrete(limits = c("Google", "DuckDuckGo", "Bing")) +
  scale_fill_gradient(
    low = "#C2C2FF",
    high = "#7426da",
    limits = c(0.2, 0.8),
    breaks = c(0.2, 0.5, 0.8),
    labels = scales::percent_format(accuracy = 1),
    name = "Similarity"
  ) +
  guides(fill = guide_colorsteps(title.position = "top", title.hjust = 0.5)) +
  labs(
    title = "Yahoo, Bing and DuckDuckGo give very different results",
    x = "",
    y = ""
  ) +
  theme(
    panel.grid = element_blank(),
    plot.title = element_text(size = rel(1.6)),
    axis.text = element_text(size = rel(1.2), face = "bold"),
    legend.position = c(0.85, 0.8),
    legend.direction = "horizontal",
    legend.key.width = unit(10, "mm"),
    legend.key.height = unit(2, "mm")
  )

# Render the chart to a PNG file
ragg::agg_png(
  here::here("plots", "plot_01_plot_ranking_similarity.png"),
  width = 8,
  height = 8,
  units = "in",
  res = 320
)
print(p)
dev.off()
We show below how these values change when we compute the similarity by considering the top 3, top 10, top 20 and top 30.
Overall, the more top results we consider, the less similar the rankings are. This makes sense: the higher a result is on the page, the less random it should be, so top results should converge between search engines.
We see that many of Google’s results are not found by its competitors (and vice versa); in particular, on average only about 25% of Google’s top 30 results are found in the top 30 of its competitors.
se_list <- c("Google", "Yahoo", "Bing", "DuckDuckGo")

#' Draw the pairwise search engine similarity tile chart
#'
#' Builds every ordered pair of the four search engines, looks up the
#' similarity of each pair in `similarity_by_se` (trying both orientations of
#' the pair) and draws one labelled tile per pair. The diagonal and the
#' (DuckDuckGo, Bing) cell are painted over in "#D3D2F9" and left unlabelled.
#'
#' @param similarity_by_se Data frame with the columns `search_engine`,
#'   `search_engine2`, `similarity` and `descr` (`descr` is dropped). Engine
#'   names must already match the ones used here ("Yahoo", "Bing",
#'   "DuckDuckGo", "Google").
#' @param results Text shown as the plot subtitle.
#' @return A ggplot object.
plot_similarity_between_se <- function(similarity_by_se, results = "Top 10") {
  # Local on purpose: the function no longer depends on a global `se_list`.
  # The same vector also fixes the factor level order below.
  engines <- c("Yahoo", "Bing", "DuckDuckGo", "Google")
  pair_similarity <- select(similarity_by_se, -descr)

  df <-
    crossing(se_1 = engines, se_2 = engines) %>%
    left_join(
      pair_similarity,
      by = c("se_1" = "search_engine", "se_2" = "search_engine2")
    ) %>%
    left_join(
      pair_similarity,
      by = c("se_1" = "search_engine2", "se_2" = "search_engine")
    ) %>%
    mutate(
      # keep whichever orientation of the pair found a match
      similarity = coalesce(similarity.x, similarity.y),
      se_1 = factor(se_1, levels = engines),
      se_2 = factor(se_2, levels = engines),
      include_exclude = case_when(
        se_1 == se_2 ~ "exclude",
        se_1 == "DuckDuckGo" & se_2 == "Bing" ~ "exclude",
        TRUE ~ "include"
      ),
      label = ifelse(is.na(similarity), NA, glue::glue("{round(similarity * 100, 0)}%"))
    ) %>%
    select(se_1, se_2, similarity, include_exclude, label)

  ggplot(df, aes(x = se_1, y = se_2, fill = similarity)) +
    geom_tile(color = "#D3D2F9", size = 2.5) +
    # blank out the excluded tiles
    geom_tile(data = filter(df, include_exclude == "exclude"), fill = "#D3D2F9") +
    geom_text(
      data = filter(df, include_exclude == "include"),
      aes(label = label),
      size = 7, family = "Poppins", fontface = "bold", color = "white"
    ) +
    # the axis limits select which engines appear on each axis
    scale_x_discrete(limits = c("Yahoo", "Bing", "DuckDuckGo")) +
    scale_y_discrete(limits = c("Google", "DuckDuckGo", "Bing")) +
    scale_fill_gradient(
      low = "#C2C2FF", high = "#7426da", limits = c(0.2, 0.8), breaks = c(0.2, 0.5, 0.8),
      labels = scales::percent_format(accuracy = 1), name = "Similarity"
    ) +
    guides(fill = guide_colorsteps(title.position = "top", title.hjust = 0.5)) +
    labs(
      subtitle = results,
      x = "",
      y = ""
    ) +
    theme(
      panel.grid = element_blank(),
      plot.subtitle = element_text(
        size = rel(1.2), face = "bold", color = "grey40",
        margin = margin(b = 5), hjust = 0.5
      ),
      axis.text = element_text(size = rel(1), face = "bold"),
      legend.position = "top",
      legend.direction = "horizontal",
      legend.key.width = unit(10, "mm"),
      legend.key.height = unit(2, "mm")
    )
}
# Spell out "DuckDuck" as "DuckDuckGo" in both engine columns of each similarity table
similarity_by_se_3 <-
  similarity_by_se_3 %>%
  mutate(across(
    c(search_engine, search_engine2),
    ~ if_else(.x == "DuckDuck", "DuckDuckGo", .x)
  ))

similarity_by_se_10 <-
  similarity_by_se_10 %>%
  mutate(across(
    c(search_engine, search_engine2),
    ~ if_else(.x == "DuckDuck", "DuckDuckGo", .x)
  ))

similarity_by_se_20 <-
  similarity_by_se_20 %>%
  mutate(across(
    c(search_engine, search_engine2),
    ~ if_else(.x == "DuckDuck", "DuckDuckGo", .x)
  ))

similarity_by_se_30 <-
  similarity_by_se_30 %>%
  mutate(across(
    c(search_engine, search_engine2),
    ~ if_else(.x == "DuckDuck", "DuckDuckGo", .x)
  ))

# One tile chart per ranking depth
p1 <- plot_similarity_between_se(similarity_by_se_3, results = "Top 3 Results")
p2 <- plot_similarity_between_se(similarity_by_se_10, results = "Top 10 Results")
p3 <- plot_similarity_between_se(similarity_by_se_20, results = "Top 20 Results")
p4 <- plot_similarity_between_se(similarity_by_se_30, results = "Top 30 Results")

# 2 x 2 grid with a single collected legend; `&` applies the theme to every panel
p <- (
  (p1 + p2) / (p3 + p4) +
    plot_layout(guides = "collect") +
    plot_annotation(title = "Similarity between search engines")
) &
  theme(
    plot.title = element_text(size = rel(1.6)),
    legend.position = c(0.9, 0.9)
  )

ragg::agg_png(
  here::here("plots", "plot_02_plot_ranking_similarity_trend.png"),
  width = 16,
  height = 12,
  units = "in",
  res = 320
)
print(p)
dev.off()
The sample of search keywords was chosen so that we got a fairly even split across different search volumes.
## # A tibble: 3 x 2
## monthly_search_volume_level Count
## <fct> <chr>
## 1 500-1000 16,599
## 2 1000-10000 19,668
## 3 10000-100000 19,712
We show below how search volume affects the similarity of a search engine’s top results to Google’s top results, considering the top 10, top 20 or top 30.
Though the difference is slight, we see that top results are most similar to Google for medium volumes (1000-10000 searches per month).
# Display labels for the monthly search volume buckets.
# NOTE(review): assumes the factor levels are ordered 500-1000, 1000-10000,
# 10000-100000, as in the count table of the report. The last bucket tops out at
# 100,000 searches, so it is labelled "10k-100k" (it previously read "10k-1m").
volume_labels <- c("0.5k-1k", "1k-10k", "10k-100k")

# Rows used to write each engine's name next to its line: the highest volume
# bucket of the "top 10" facet only. The bucket is relabelled with the same text
# as in the plot data so the labels land on the matching x position.
se_labels <-
  similarity_by_se_and_vol %>%
  filter(monthly_search_volume_level == "10000-100000", grp == "top 10") %>%
  mutate(
    search_engine = if_else(search_engine == "DuckDuck", "DuckDuckGo", search_engine),
    monthly_search_volume_level = factor(monthly_search_volume_level, labels = volume_labels[3])
  )

# Line chart of similarity against search volume, one line per engine, faceted by `grp`
p <-
  similarity_by_se_and_vol %>%
  mutate(
    search_engine = if_else(search_engine == "DuckDuck", "DuckDuckGo", search_engine),
    monthly_search_volume_level = factor(monthly_search_volume_level, labels = volume_labels)
  ) %>%
  ggplot(aes(
    x = monthly_search_volume_level, y = similarity,
    colour = search_engine, group = search_engine
  )) +
  geom_line() +
  geom_point(aes(fill = search_engine), shape = 21, size = 4, color = "white", stroke = 0.6) +
  ggrepel::geom_text_repel(
    data = se_labels,
    aes(label = search_engine),
    nudge_y = 0.0075, nudge_x = -0.35,
    size = 3, hjust = 0, fontface = "bold", family = "Poppins"
  ) +
  scale_y_continuous(
    limits = c(0.2, 0.35),
    breaks = seq(0.2, 0.35, 0.05),
    labels = scales::label_percent(accuracy = 1),
    expand = expansion(0)
  ) +
  scale_x_discrete(expand = expansion(0.05)) +
  scale_color_manual(values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")) +
  scale_fill_manual(values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")) +
  labs(
    # the trailing newline is part of the original title text
    title = "Obscure and popular keywords will get you the most variety by using multiple search engines\n",
    x = "Monthly Search Volume",
    y = "Search Engine Similarity"
  ) +
  facet_wrap(~grp, ncol = 3, labeller = labeller(grp = str_to_title)) +
  # allow the engine labels to be drawn outside the panel area
  coord_cartesian(clip = "off")

ragg::agg_png(
  here::here("plots", "plot_03_search_vol_similarity.png"),
  width = 9, height = 5, units = "in", res = 320
)
print(p)
dev.off()
Each of the keywords comes with categories (or tags). It’s possible for keywords to belong to more than one category. The sample of keywords was drawn randomly so we don’t expect to have an equal amount in each category.
The most common categories are News, Media & Publications, Arts & Entertainment, Business & Industrial, and Hobbies & Leisure.
# Lollipop chart of `n` per `keyword_category` from category1_counts
p <-
  ggplot(data = category1_counts, mapping = aes(x = n, y = keyword_category)) +
  # geom_col() +
  # stem running from the left edge of the panel to the point
  geom_segment(
    mapping = aes(xend = -Inf, yend = keyword_category),
    size = 0.75,
    color = "grey95"
  ) +
  geom_point(
    shape = 21,
    size = 4,
    fill = "#807FFF",
    stroke = 1.1,
    color = "grey95"
  ) +
  scale_x_continuous(
    limits = c(0, NA),
    labels = scales::label_comma(),
    breaks = seq(4000, 16000, 4000)
  ) +
  labs(
    title = "# of Keywords by Category",
    x = NULL,
    y = NULL
  ) +
  theme(panel.grid.major.y = element_blank())

ragg::agg_png(
  here::here("plots", "plot_04_keyword_category.png"),
  width = 9,
  height = 5,
  units = "in",
  res = 320
)
print(p)
dev.off()
Deeper levels of categories are available, nested into the main categories, we show below the breakdown of the level 2 and 3 categories.
# Upper panel: lollipop chart of `n` per `keyword_category` from category2_counts
p1 <-
  ggplot(data = category2_counts, mapping = aes(x = n, y = keyword_category)) +
  geom_segment(
    mapping = aes(xend = -Inf, yend = keyword_category),
    size = 0.75,
    color = "grey95"
  ) +
  geom_point(
    shape = 21,
    size = 4,
    fill = "#807FFF",
    stroke = 1.1,
    color = "grey95"
  ) +
  scale_x_continuous(
    limits = c(0, 6000),
    labels = scales::label_comma(),
    breaks = seq(2000, 6000, 2000)
  ) +
  labs(
    title = "# of Keywords by Category",
    subtitle = "Level 2",
    x = NULL,
    y = NULL
  ) +
  theme(
    plot.subtitle = element_text(face = "bold", color = "grey40"),
    panel.grid.major.y = element_blank()
  )

# Lower panel: same chart for category3_counts, on the same x scale and without a title
p2 <-
  ggplot(data = category3_counts, mapping = aes(x = n, y = keyword_category)) +
  geom_segment(
    mapping = aes(xend = -Inf, yend = keyword_category),
    size = 0.75,
    color = "grey95"
  ) +
  geom_point(
    shape = 21,
    size = 4,
    fill = "#807FFF",
    stroke = 1.1,
    color = "grey95"
  ) +
  scale_x_continuous(
    limits = c(0, 6000),
    labels = scales::label_comma(),
    breaks = seq(2000, 6000, 2000)
  ) +
  labs(
    subtitle = "Level 3",
    x = NULL,
    y = NULL
  ) +
  theme(
    plot.subtitle = element_text(face = "bold", color = "grey40"),
    panel.grid.major.y = element_blank()
  )

# Stack the two panels vertically (patchwork)
p <- p1 / p2

ragg::agg_png(
  here::here("plots", "plot_05_keyword_category_levels_2_3.png"),
  width = 9,
  height = 10,
  units = "in",
  res = 320
)
print(p)
dev.off()
The following chart shows that there is significant variance in similarity depending on what category the keyword belongs to. Real Estate and Travel & Tourism are the categories where the top 10 results of Google and of the other search engines are the most similar, while Apparel and Beauty & Personal Care are the categories showing the most dissimilar top 10.
For every category Yahoo is the most dissimilar to Google and DuckDuckGo the most similar, and the difference in similarity between those goes up to about 3.5 % for some categories such as Health and Apparel.
## # A tibble: 66 x 4
## search_engine keyword_category similarity n
## <chr> <fct> <dbl> <int>
## 1 Yahoo News, Media & Publications 0.296 16370
## 2 Bing News, Media & Publications 0.311 16369
## 3 DuckDuck News, Media & Publications 0.319 16353
## 4 Bing Arts & Entertainment 0.298 14941
## 5 Yahoo Arts & Entertainment 0.281 14932
## 6 DuckDuck Arts & Entertainment 0.306 14929
## 7 Yahoo Business & Industrial 0.282 12398
## 8 Bing Business & Industrial 0.300 12395
## 9 DuckDuck Business & Industrial 0.309 12386
## 10 Yahoo Hobbies & Leisure 0.286 10288
## 11 Bing Hobbies & Leisure 0.305 10286
## 12 DuckDuck Hobbies & Leisure 0.315 10274
## 13 Bing Computers & Consumer Electronics 0.292 6180
## 14 Yahoo Computers & Consumer Electronics 0.275 6178
## 15 DuckDuck Computers & Consumer Electronics 0.300 6170
## 16 Yahoo Home & Garden 0.270 5844
## 17 Bing Home & Garden 0.294 5843
## 18 DuckDuck Home & Garden 0.301 5837
## 19 Bing Travel & Tourism 0.349 5300
## 20 Yahoo Travel & Tourism 0.329 5298
## 21 DuckDuck Travel & Tourism 0.363 5290
## 22 Yahoo Internet & Telecom 0.271 4923
## 23 Yahoo Sports & Fitness 0.276 4923
## 24 Bing Sports & Fitness 0.289 4922
## 25 Yahoo Family & Community 0.290 4919
## 26 Bing Family & Community 0.304 4918
## 27 Bing Internet & Telecom 0.285 4918
## 28 DuckDuck Sports & Fitness 0.301 4917
## 29 DuckDuck Family & Community 0.318 4912
## 30 DuckDuck Internet & Telecom 0.292 4910
## 31 Bing Health 0.314 4795
## 32 Yahoo Health 0.290 4792
## 33 DuckDuck Health 0.325 4788
## 34 Bing Jobs & Education 0.315 4494
## 35 Yahoo Jobs & Education 0.303 4492
## 36 DuckDuck Jobs & Education 0.327 4483
## 37 DuckDuck Apparel 0.289 4062
## 38 Yahoo Apparel 0.254 4060
## 39 Bing Apparel 0.278 4056
## 40 Yahoo Food & Groceries 0.304 3755
## 41 Bing Food & Groceries 0.323 3751
## 42 DuckDuck Food & Groceries 0.334 3739
## 43 Bing Vehicles 0.296 3135
## 44 Yahoo Vehicles 0.285 3135
## 45 DuckDuck Vehicles 0.310 3132
## 46 Bing Beauty & Personal Care 0.282 3060
## 47 DuckDuck Beauty & Personal Care 0.293 3058
## 48 Yahoo Beauty & Personal Care 0.261 3058
## 49 Bing Law & Government 0.300 2888
## 50 Yahoo Law & Government 0.290 2884
## 51 DuckDuck Law & Government 0.310 2876
## 52 Bing Real Estate 0.364 2482
## 53 Yahoo Real Estate 0.347 2482
## 54 DuckDuck Real Estate 0.374 2476
## 55 Bing Dining & Nightlife 0.318 2330
## 56 Yahoo Dining & Nightlife 0.310 2330
## 57 DuckDuck Dining & Nightlife 0.333 2325
## 58 Bing Finance 0.307 2281
## 59 Yahoo Finance 0.297 2280
## 60 DuckDuck Finance 0.317 2271
## 61 Bing Retailers & General Merchandise 0.305 2059
## 62 Yahoo Retailers & General Merchandise 0.294 2059
## 63 DuckDuck Retailers & General Merchandise 0.316 2056
## 64 Yahoo Occasions & Gifts 0.277 1940
## 65 Bing Occasions & Gifts 0.297 1938
## 66 DuckDuck Occasions & Gifts 0.306 1934
# Dot plot of similarity per keyword category, one dot per search engine
p <-
  similarity_by_se_and_category %>%
  mutate(search_engine = if_else(search_engine == "DuckDuck", "DuckDuckGo", search_engine)) %>%
  ggplot(mapping = aes(x = similarity, y = keyword_category, group = search_engine)) +
  geom_point(
    mapping = aes(fill = search_engine),
    shape = 21,
    size = 5,
    stroke = 1.1,
    color = "grey95"
  ) +
  # the x axis is duplicated on the opposite side, without a title
  scale_x_continuous(
    limits = c(0.2, 0.4),
    labels = scales::label_percent(accuracy = 1),
    expand = expansion(0),
    sec.axis = dup_axis(name = "")
  ) +
  scale_color_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  # `limits` fixes the order of the legend entries
  scale_fill_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0"),
    limits = c("Yahoo", "Bing", "DuckDuckGo")
  ) +
  coord_cartesian(clip = "off") +
  guides(
    fill = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      label.theme = element_text(size = 9, face = "bold", color = "grey30"),
      nrow = 1
    )
  ) +
  labs(
    title = "Topic makes a huge difference in how similar search results are",
    subtitle = "Based on Top 10 rankings",
    colour = "Search Engine",
    x = "Search Engine Similarity",
    y = "",
    fill = "Search Engine"
  ) +
  theme(
    plot.subtitle = element_text(size = rel(1.1), face = "bold", color = "grey40"),
    # a y value above 1 places the legend above the panel area
    legend.position = c(0.8, 1.08),
    legend.title = element_text(size = rel(0.8), face = "bold", color = "grey60")
  )

ragg::agg_png(
  here::here("plots", "plot_06_plot_google_similarity.png"),
  width = 9,
  height = 9,
  units = "in",
  res = 320
)
print(p)
dev.off()
## png
## 2
We reproduce this chart for the 10 biggest subcategories of level 2.
## # A tibble: 30 x 4
## search_engine keyword_category similarity n
## <chr> <fct> <dbl> <int>
## 1 Yahoo Reference Materials & Resources 0.334 5091
## 2 Bing Reference Materials & Resources 0.349 5088
## 3 DuckDuck Reference Materials & Resources 0.356 5086
## 4 Bing TV & Video 0.294 5001
## 5 DuckDuck TV & Video 0.301 4999
## 6 Yahoo TV & Video 0.277 4999
## 7 Bing Computers 0.293 4712
## 8 Yahoo Computers 0.275 4709
## 9 DuckDuck Computers 0.300 4704
## 10 Bing Online Media 0.283 4648
## 11 Yahoo Online Media 0.266 4648
## 12 DuckDuck Online Media 0.291 4638
## 13 Yahoo Internet 0.272 4140
## 14 Bing Internet 0.285 4138
## 15 DuckDuck Internet 0.292 4130
## 16 DuckDuck Music & Audio 0.302 3661
## 17 Bing Music & Audio 0.294 3660
## 18 Yahoo Music & Audio 0.282 3659
## 19 Bing Education & Training 0.317 3521
## 20 Yahoo Education & Training 0.303 3520
## 21 DuckDuck Education & Training 0.330 3514
## 22 Bing Health Conditions & Concerns 0.320 3101
## 23 Yahoo Health Conditions & Concerns 0.290 3100
## 24 DuckDuck Health Conditions & Concerns 0.331 3098
## 25 Yahoo Consumer Electronics 0.268 3064
## 26 Bing Consumer Electronics 0.286 3062
## 27 Bing Sports 0.292 3061
## 28 DuckDuck Sports 0.301 3059
## 29 Yahoo Sports 0.277 3059
## 30 DuckDuck Consumer Electronics 0.296 3057
# Dot plot of similarity per level 2 keyword category, one dot per search engine
p <-
  similarity_by_se_and_category_2 %>%
  mutate(search_engine = if_else(search_engine == "DuckDuck", "DuckDuckGo", search_engine)) %>%
  ggplot(mapping = aes(x = similarity, y = keyword_category, group = search_engine)) +
  geom_point(
    mapping = aes(fill = search_engine),
    shape = 21,
    size = 5,
    stroke = 1.1,
    color = "grey95"
  ) +
  # the x axis is duplicated on the opposite side, without a title
  scale_x_continuous(
    limits = c(0.2, 0.4),
    labels = scales::label_percent(accuracy = 1),
    expand = expansion(0),
    sec.axis = dup_axis(name = "")
  ) +
  scale_y_discrete(expand = expansion(0.03)) +
  scale_color_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  # `limits` fixes the order of the legend entries
  scale_fill_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0"),
    limits = c("Yahoo", "Bing", "DuckDuckGo")
  ) +
  coord_cartesian(clip = "off") +
  guides(
    fill = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      label.theme = element_text(size = 9, face = "bold", color = "grey30"),
      nrow = 1
    )
  ) +
  labs(
    title = "Search Engine similarity to Google by Keyword Category (Level 2)",
    subtitle = "Based on Top 10 rankings",
    colour = "Search Engine",
    x = "Search Engine Similarity",
    y = "",
    fill = "Search Engine"
  ) +
  theme(
    plot.subtitle = element_text(size = rel(1.1), face = "bold", color = "grey40"),
    # a y value above 1 places the legend above the panel area
    legend.position = c(0.8, 1.12),
    legend.title = element_text(size = rel(0.8), face = "bold", color = "grey60")
  )

ragg::agg_png(
  here::here("plots", "plot_07_plot_google_similarity_level_2.png"),
  width = 9,
  height = 6,
  units = "in",
  res = 320
)
print(p)
dev.off()
## png
## 2
Finally we offer a breakdown of the accuracy by main category and volume category.
We see that volume can make a big difference in some cases, for instance for Health related keywords Bing and DuckDuckGo become much more accurate as volume increases.
We see also that Real Estate related searches yield the rankings most similar to google compared to other categories at low and medium volume, but not at high volume.
## # A tibble: 198 x 5
## search_engine keyword_category monthly_search_volu~ similarity n
## <chr> <fct> <fct> <dbl> <int>
## 1 DuckDuck Real Estate 500-1000 0.396 911
## 2 Bing Real Estate 500-1000 0.387 915
## 3 DuckDuck Real Estate 1000-10000 0.383 940
## 4 Bing Real Estate 1000-10000 0.374 942
## 5 DuckDuck Travel & Tourism 1000-10000 0.369 1850
## 6 Yahoo Real Estate 500-1000 0.364 915
## 7 DuckDuck Travel & Tourism 500-1000 0.364 1752
## 8 Bing Travel & Tourism 1000-10000 0.359 1853
## 9 Yahoo Real Estate 1000-10000 0.357 942
## 10 DuckDuck Travel & Tourism 10000-100000 0.356 1688
## 11 Bing Travel & Tourism 500-1000 0.346 1755
## 12 DuckDuck Dining & Nightlife 500-1000 0.344 754
## 13 DuckDuck Dining & Nightlife 1000-10000 0.342 866
## 14 Bing Travel & Tourism 10000-100000 0.340 1692
## 15 DuckDuck Food & Groceries 500-1000 0.338 1109
## 16 DuckDuck Food & Groceries 1000-10000 0.338 1308
## 17 DuckDuck Health 10000-100000 0.337 1624
## 18 Yahoo Travel & Tourism 1000-10000 0.335 1849
## 19 DuckDuck Jobs & Education 1000-10000 0.335 1589
## 20 Yahoo Travel & Tourism 500-1000 0.330 1756
## 21 Bing Dining & Nightlife 500-1000 0.329 755
## 22 DuckDuck Jobs & Education 500-1000 0.329 1330
## 23 Bing Dining & Nightlife 1000-10000 0.329 869
## 24 DuckDuck Real Estate 10000-100000 0.328 625
## 25 DuckDuck Food & Groceries 10000-100000 0.327 1322
## 26 Bing Health 10000-100000 0.327 1624
## 27 Bing Food & Groceries 500-1000 0.327 1108
## 28 Bing Food & Groceries 1000-10000 0.326 1312
## 29 DuckDuck Health 1000-10000 0.324 1669
## 30 Yahoo Travel & Tourism 10000-100000 0.323 1693
## 31 DuckDuck Vehicles 1000-10000 0.323 1116
## 32 DuckDuck Finance 500-1000 0.322 624
## 33 Bing Jobs & Education 1000-10000 0.321 1592
## 34 DuckDuck News, Media & Publicati~ 1000-10000 0.320 5628
## 35 Yahoo Dining & Nightlife 1000-10000 0.319 867
## 36 DuckDuck News, Media & Publicati~ 10000-100000 0.319 6559
## 37 DuckDuck Retailers & General Mer~ 1000-10000 0.319 754
## 38 DuckDuck Jobs & Education 10000-100000 0.319 1564
## 39 DuckDuck Family & Community 10000-100000 0.319 1634
## 40 DuckDuck Family & Community 1000-10000 0.319 1798
## 41 DuckDuck Hobbies & Leisure 500-1000 0.318 2916
## 42 DuckDuck Finance 10000-100000 0.318 883
## 43 DuckDuck Law & Government 500-1000 0.317 840
## 44 Bing Food & Groceries 10000-100000 0.317 1331
## 45 DuckDuck News, Media & Publicati~ 500-1000 0.316 4166
## 46 DuckDuck Family & Community 500-1000 0.315 1480
## 47 Bing Real Estate 10000-100000 0.315 625
## 48 DuckDuck Retailers & General Mer~ 500-1000 0.315 603
## 49 DuckDuck Health 500-1000 0.315 1495
## 50 Bing Jobs & Education 500-1000 0.314 1331
## 51 DuckDuck Hobbies & Leisure 1000-10000 0.314 3577
## 52 DuckDuck Hobbies & Leisure 10000-100000 0.313 3781
## 53 Bing News, Media & Publicati~ 1000-10000 0.313 5632
## 54 DuckDuck Retailers & General Mer~ 10000-100000 0.313 699
## 55 DuckDuck Finance 1000-10000 0.312 764
## 56 Yahoo Jobs & Education 1000-10000 0.312 1593
## 57 DuckDuck Business & Industrial 10000-100000 0.311 4359
## 58 Bing News, Media & Publicati~ 10000-100000 0.311 6565
## 59 Yahoo Dining & Nightlife 500-1000 0.311 756
## 60 DuckDuck Law & Government 1000-10000 0.310 1014
## 61 DuckDuck Business & Industrial 1000-10000 0.310 4345
## 62 DuckDuck Dining & Nightlife 10000-100000 0.310 705
## 63 Bing Health 1000-10000 0.310 1673
## 64 Bing News, Media & Publicati~ 500-1000 0.309 4172
## 65 DuckDuck Occasions & Gifts 500-1000 0.309 644
## 66 Bing Jobs & Education 10000-100000 0.309 1571
## 67 Bing Finance 10000-100000 0.309 887
## 68 Yahoo Food & Groceries 1000-10000 0.308 1314
## 69 DuckDuck Arts & Entertainment 1000-10000 0.308 5160
## 70 DuckDuck Occasions & Gifts 1000-10000 0.308 677
## 71 Bing Hobbies & Leisure 500-1000 0.308 2920
## 72 DuckDuck Computers & Consumer El~ 1000-10000 0.306 2124
## 73 Bing Retailers & General Mer~ 1000-10000 0.306 755
## 74 Bing Family & Community 10000-100000 0.306 1637
## 75 DuckDuck Apparel 10000-100000 0.306 1486
## 76 Bing Finance 1000-10000 0.306 766
## 77 DuckDuck Business & Industrial 500-1000 0.306 3682
## 78 Bing Law & Government 1000-10000 0.306 1017
## 79 Bing Hobbies & Leisure 1000-10000 0.306 3583
## 80 Bing Health 500-1000 0.305 1498
## 81 Bing Retailers & General Mer~ 10000-100000 0.305 699
## 82 Yahoo Real Estate 10000-100000 0.305 625
## 83 DuckDuck Sports & Fitness 1000-10000 0.305 1741
## 84 DuckDuck Arts & Entertainment 10000-100000 0.305 5729
## 85 Bing Vehicles 1000-10000 0.305 1115
## 86 DuckDuck Law & Government 10000-100000 0.305 1022
## 87 Bing Finance 500-1000 0.305 628
## 88 Bing Family & Community 500-1000 0.304 1484
## 89 DuckDuck Arts & Entertainment 500-1000 0.304 4040
## 90 DuckDuck Home & Garden 10000-100000 0.304 1934
## 91 DuckDuck Home & Garden 1000-10000 0.304 2092
## 92 Yahoo Food & Groceries 500-1000 0.304 1110
## 93 DuckDuck Vehicles 500-1000 0.303 1086
## 94 DuckDuck Vehicles 10000-100000 0.303 930
## 95 Bing Hobbies & Leisure 10000-100000 0.303 3783
## 96 Bing Occasions & Gifts 1000-10000 0.303 679
## 97 Bing Family & Community 1000-10000 0.303 1797
## 98 Yahoo Finance 10000-100000 0.302 887
## 99 Bing Retailers & General Mer~ 500-1000 0.302 605
## 100 DuckDuck Occasions & Gifts 10000-100000 0.302 613
## 101 Bing Law & Government 500-1000 0.301 846
## 102 Bing Business & Industrial 10000-100000 0.301 4364
## 103 Bing Business & Industrial 1000-10000 0.301 4350
## 104 DuckDuck Beauty & Personal Care 10000-100000 0.301 1118
## 105 Bing Arts & Entertainment 1000-10000 0.301 5164
## 106 Yahoo Food & Groceries 10000-100000 0.301 1331
## 107 Yahoo Jobs & Education 500-1000 0.300 1330
## 108 Yahoo Retailers & General Mer~ 10000-100000 0.299 699
## 109 DuckDuck Sports & Fitness 10000-100000 0.298 1685
## 110 Yahoo News, Media & Publicati~ 10000-100000 0.298 6569
## 111 Yahoo Dining & Nightlife 10000-100000 0.298 707
## 112 DuckDuck Sports & Fitness 500-1000 0.298 1491
## 113 DuckDuck Internet & Telecom 1000-10000 0.298 1681
## 114 Bing Home & Garden 1000-10000 0.297 2092
## 115 DuckDuck Computers & Consumer El~ 500-1000 0.297 1695
## 116 Bing Computers & Consumer El~ 1000-10000 0.297 2127
## 117 DuckDuck Computers & Consumer El~ 10000-100000 0.297 2351
## 118 Bing Arts & Entertainment 10000-100000 0.297 5730
## 119 Yahoo Health 10000-100000 0.297 1624
## 120 Yahoo News, Media & Publicati~ 1000-10000 0.296 5630
## 121 Bing Business & Industrial 500-1000 0.296 3681
## 122 Bing Arts & Entertainment 500-1000 0.296 4047
## 123 Bing Occasions & Gifts 500-1000 0.296 643
## 124 Bing Home & Garden 10000-100000 0.296 1940
## 125 DuckDuck Home & Garden 500-1000 0.296 1811
## 126 Yahoo Jobs & Education 10000-100000 0.295 1569
## 127 Yahoo Vehicles 1000-10000 0.295 1117
## 128 Yahoo Finance 500-1000 0.295 628
## 129 DuckDuck Beauty & Personal Care 1000-10000 0.294 1054
## 130 Bing Vehicles 500-1000 0.294 1086
## 131 Yahoo Retailers & General Mer~ 500-1000 0.294 605
## 132 Yahoo News, Media & Publicati~ 500-1000 0.294 4171
## 133 Bing Dining & Nightlife 10000-100000 0.293 706
## 134 Bing Sports & Fitness 1000-10000 0.293 1739
## 135 Bing Law & Government 10000-100000 0.292 1025
## 136 Yahoo Law & Government 1000-10000 0.292 1014
## 137 Bing Occasions & Gifts 10000-100000 0.292 616
## 138 Yahoo Law & Government 500-1000 0.291 845
## 139 Yahoo Family & Community 10000-100000 0.291 1635
## 140 Yahoo Finance 1000-10000 0.291 765
## 141 Bing Computers & Consumer El~ 10000-100000 0.291 2356
## 142 DuckDuck Internet & Telecom 500-1000 0.291 1277
## 143 Bing Apparel 10000-100000 0.290 1480
## 144 Yahoo Family & Community 1000-10000 0.290 1800
## 145 Bing Internet & Telecom 1000-10000 0.290 1684
## 146 Yahoo Retailers & General Mer~ 1000-10000 0.290 755
## 147 Bing Home & Garden 500-1000 0.289 1811
## 148 Bing Sports & Fitness 500-1000 0.289 1497
## 149 Bing Vehicles 10000-100000 0.289 934
## 150 Bing Computers & Consumer El~ 500-1000 0.289 1697
## 151 Yahoo Hobbies & Leisure 1000-10000 0.288 3583
## 152 Yahoo Family & Community 500-1000 0.288 1484
## 153 DuckDuck Internet & Telecom 10000-100000 0.288 1952
## 154 Yahoo Health 1000-10000 0.287 1670
## 155 Bing Sports & Fitness 10000-100000 0.286 1686
## 156 Yahoo Hobbies & Leisure 10000-100000 0.286 3783
## 157 Yahoo Business & Industrial 10000-100000 0.286 4363
## 158 Yahoo Law & Government 10000-100000 0.285 1025
## 159 Yahoo Health 500-1000 0.285 1498
## 160 DuckDuck Apparel 1000-10000 0.285 1405
## 161 Yahoo Hobbies & Leisure 500-1000 0.285 2922
## 162 Bing Beauty & Personal Care 10000-100000 0.284 1120
## 163 Bing Internet & Telecom 10000-100000 0.284 1952
## 164 Bing Beauty & Personal Care 1000-10000 0.284 1052
## 165 Yahoo Arts & Entertainment 1000-10000 0.283 5156
## 166 Yahoo Occasions & Gifts 1000-10000 0.282 679
## 167 Yahoo Vehicles 500-1000 0.282 1085
## 168 Yahoo Business & Industrial 1000-10000 0.281 4350
## 169 DuckDuck Beauty & Personal Care 500-1000 0.281 886
## 170 Yahoo Arts & Entertainment 500-1000 0.280 4045
## 171 Yahoo Sports & Fitness 1000-10000 0.280 1743
## 172 Yahoo Arts & Entertainment 10000-100000 0.280 5731
## 173 Bing Internet & Telecom 500-1000 0.279 1282
## 174 Yahoo Computers & Consumer El~ 1000-10000 0.279 2125
## 175 Yahoo Business & Industrial 500-1000 0.279 3685
## 176 Bing Beauty & Personal Care 500-1000 0.279 888
## 177 Yahoo Vehicles 10000-100000 0.278 933
## 178 Yahoo Sports & Fitness 500-1000 0.276 1495
## 179 Yahoo Internet & Telecom 1000-10000 0.276 1683
## 180 Yahoo Occasions & Gifts 500-1000 0.276 644
## 181 Bing Apparel 1000-10000 0.276 1404
## 182 Yahoo Computers & Consumer El~ 10000-100000 0.274 2355
## 183 Yahoo Occasions & Gifts 10000-100000 0.272 617
## 184 DuckDuck Apparel 500-1000 0.272 1171
## 185 Yahoo Home & Garden 10000-100000 0.272 1941
## 186 Yahoo Home & Garden 1000-10000 0.272 2092
## 187 Yahoo Internet & Telecom 10000-100000 0.271 1955
## 188 Yahoo Sports & Fitness 10000-100000 0.271 1685
## 189 Yahoo Computers & Consumer El~ 500-1000 0.270 1698
## 190 Yahoo Home & Garden 500-1000 0.267 1811
## 191 Bing Apparel 500-1000 0.266 1172
## 192 Yahoo Apparel 10000-100000 0.266 1483
## 193 Yahoo Beauty & Personal Care 10000-100000 0.264 1119
## 194 Yahoo Beauty & Personal Care 1000-10000 0.264 1051
## 195 Yahoo Internet & Telecom 500-1000 0.264 1285
## 196 Yahoo Beauty & Personal Care 500-1000 0.254 888
## 197 Yahoo Apparel 1000-10000 0.249 1404
## 198 Yahoo Apparel 500-1000 0.245 1173
# Plot 08: dot plot of each search engine's similarity to Google, one row per
# keyword category and one facet per monthly search-volume level.
p <- ggplot(
  data = mutate(
    similarity_by_se_category_and_vol,
    # Labels are matched positionally to the levels of the volume column.
    # NOTE(review): the last label reads "10k-1m" while the printed level is
    # "10000-100000" -- confirm which upper bound is intended.
    monthly_search_volume_level = factor(
      monthly_search_volume_level,
      labels = c("0.5k-1k", "1k-10k", "10k-1m")
    ),
    search_engine = if_else(search_engine == "DuckDuck", "DuckDuckGo", search_engine)
  ),
  mapping = aes(x = similarity, y = keyword_category, group = search_engine)
) +
  geom_point(
    aes(fill = search_engine),
    shape = 21, size = 5, stroke = 1.1, color = "grey95"
  ) +
  facet_wrap(~ monthly_search_volume_level) +
  coord_cartesian(clip = "off") +
  scale_x_continuous(
    limits = c(0.2, 0.4),
    labels = scales::label_percent(accuracy = 1),
    expand = expansion(0)
  ) +
  scale_color_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  scale_fill_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0"),
    limits = c("Yahoo", "Bing", "DuckDuckGo")
  ) +
  labs(
    title = "Search Engine Similarity to Google by Keyword Category",
    subtitle = "Based on Top 10 rankings",
    x = "Search Engine Similarity",
    y = "",
    colour = "Search Engine",
    fill = "Search Engine"
  ) +
  # Single-row legend with its title centred above the keys
  guides(
    fill = guide_legend(
      nrow = 1,
      title.position = "top",
      title.hjust = 0.5,
      label.theme = element_text(size = 9, face = "bold", color = "grey30")
    )
  ) +
  theme(
    # Legend sits above the panel area (y > 1), hence the larger top margin
    legend.position = c(0.86, 1.08),
    legend.title = element_text(size = rel(0.8), face = "bold", color = "grey60"),
    plot.margin = margin(30, 20, 10, 20),
    plot.subtitle = element_text(size = rel(1.1), face = "bold", color = "grey40"),
    strip.placement = "outside"
  )

# Render the plot into a PNG file through the ragg device
ragg::agg_png(
  filename = here::here("plots", "plot_08_plot_google_similarity_by_vol.png"),
  width = 12, height = 9, units = "in", res = 320
)
print(p)
dev.off()
## png
## 2
Below we show, both for small and medium volume (resp. 500-1000 and 1000-10000), 5 examples of Real Estate keywords for which we found the same url on the top for every search engine.
## # A tibble: 5 x 2
## keyword url
## <chr> <chr>
## 1 family worship center lakelan~ https://www.fwclakeland.org/
## 2 heritage at millenia https://www.heritagemillenia.com/
## 3 camp walden, ny https://www.campwalden-ny.com/
## 4 colorado springs low income h~ https://www.apartments.com/colorado-springs-co~
## 5 manatee size https://en.wikipedia.org/wiki/Manatee
## # A tibble: 5 x 2
## keyword url
## <chr> <chr>
## 1 huffines chevrolet lewis~ https://www.huffineschevylewisville.com/
## 2 pleasant grove utah https://www.plgrove.org/
## 3 academy lubbock https://www.academy.com/shop/storelocator/texas/lub~
## 4 the rack house https://www.therackhousekww.com/
## 5 atharvaa https://en.wikipedia.org/wiki/Atharvaa
Most keywords (search terms) have between 2 and 4 words. There are also 29 search terms that have at least 10 words, for example "what are the first ten amendments to the constitution called" and "how long does it take to become a pediatric nurse".
# Plot 09: bar chart of the number of keywords per keyword length (in words).
# Bars for 2 to 4 words are drawn a second time in a darker shade on top of
# the base bars.
p <-
  kw_length_counts %>%
  ggplot(aes(x = keyword_length, y = n)) +
  geom_col(fill = "#807FFF", alpha = 0.8) +
  # Overlay: only the 2-4 word bars, darkened
  geom_col(
    data = filter(kw_length_counts, keyword_length %in% 2:4),
    fill = colorspace::darken("#807FFF", 0.25),
    alpha = 0.75
  ) +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  scale_y_continuous(limits = c(0, 20000), labels = scales::label_comma()) +
  labs(
    title = "Keywords (Search terms) by Number of Words",
    y = "Number of Keywords",
    x = "Number of Words in Keyword"
  )

# The plot expression ends above. Previously a trailing `+` after labs() made
# the device-opening call below an operand of the plot expression.
ragg::agg_png(
  here::here("plots", "plot_09_plot_number_of_words.png"),
  width = 9, height = 6, units = "in", res = 320
)
print(p)
dev.off()
For all three search engines, the results get more similar to Google’s on average as keywords get longer.
For keywords of length 1 we measure an average similarity in the 28-30% range, while for keywords made of 5 or more words we get an average similarity in the 31.5-35% range.
It might be that long keywords are more precise, so the pool of potential targets decreases with the length of the search and long searches tend to converge over search engines too.
# Show the engine under its full name ("DuckDuckGo") and list every row of
# the similarity-by-keyword-length table.
similarity_by_se_and_kw_length <- mutate(
  similarity_by_se_and_kw_length,
  search_engine = replace(search_engine, search_engine == "DuckDuck", "DuckDuckGo")
)
print(similarity_by_se_and_kw_length, n = Inf)
## # A tibble: 15 x 4
## search_engine keyword_length similarity n
## <chr> <chr> <dbl> <int>
## 1 DuckDuckGo 5+ 0.349 5869
## 2 Bing 5+ 0.340 5836
## 3 DuckDuckGo 4 0.331 10028
## 4 Bing 4 0.324 9931
## 5 DuckDuckGo 3 0.323 18713
## 6 Yahoo 5+ 0.315 5830
## 7 Bing 3 0.314 18563
## 8 DuckDuckGo 2 0.305 18136
## 9 DuckDuckGo 1 0.300 3964
## 10 Yahoo 4 0.299 9952
## 11 Yahoo 3 0.293 18588
## 12 Bing 2 0.292 18000
## 13 Bing 1 0.286 3926
## 14 Yahoo 1 0.282 3933
## 15 Yahoo 2 0.280 17985
# Plot 10: one line per search engine showing its similarity to Google
# against keyword length; each line is labelled at the "5+" end.
p <- ggplot(
  similarity_by_se_and_kw_length,
  aes(x = keyword_length, y = similarity, colour = search_engine, group = search_engine)
) +
  geom_line() +
  geom_point(
    aes(fill = search_engine),
    shape = 21, size = 4, stroke = 0.6, color = "white"
  ) +
  # Engine names are placed next to the last point ("5+") of each line
  ggrepel::geom_text_repel(
    data = filter(similarity_by_se_and_kw_length, keyword_length == "5+"),
    aes(label = search_engine),
    size = 3, hjust = 0, nudge_y = 0.005,
    fontface = "bold", family = "Poppins"
  ) +
  coord_cartesian(clip = "off") +
  scale_x_discrete(expand = expansion(0.01)) +
  scale_y_continuous(
    limits = c(0.25, 0.40),
    breaks = seq(0.25, 0.40, by = 0.05),
    labels = scales::label_percent(accuracy = 1),
    expand = expansion(0)
  ) +
  scale_color_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  scale_fill_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  labs(
    title = "Searching long-tail keywords? Stick to your search engine of choice",
    subtitle = "Based on Top 10 rankings",
    x = "Keyword Length in Words",
    y = "Search Engine Similarity",
    colour = "Search Engine"
  ) +
  theme(
    plot.subtitle = element_text(face = "bold", color = "grey40", margin = margin(b = 15)),
    plot.title = element_text(face = "bold", margin = margin(b = 5))
  )

# Render the plot into a PNG file through the ragg device
ragg::agg_png(
  filename = here::here("plots", "plot_10_plot_similarity_keyword_length.png"),
  width = 7, height = 4.5, units = "in", res = 320
)
print(p)
dev.off()
## png
## 2
Breaking it down by volume level, we observe the same tendency but with a stronger trend for smaller volume (generally smaller minima and higher maxima). It might be that for low volume searches, a high number of terms makes the search very precise and, given that the pool of results is restricted, the rankings tend to converge, while a short search for low volume keywords is not precise enough to converge across search engines.
# Show the engine under its full name ("DuckDuckGo") and list every row of
# the similarity table broken down by volume level and keyword length.
similarity_by_se_vol_and_kw_length <- mutate(
  similarity_by_se_vol_and_kw_length,
  search_engine = replace(search_engine, search_engine == "DuckDuck", "DuckDuckGo")
)
print(similarity_by_se_vol_and_kw_length, n = Inf)
## # A tibble: 45 x 5
## search_engine monthly_search_volume_level keyword_length similarity n
## <chr> <fct> <chr> <dbl> <int>
## 1 DuckDuckGo 500-1000 5+ 0.344 4850
## 2 DuckDuckGo 1000-10000 5+ 0.342 4516
## 3 Bing 500-1000 5+ 0.337 4861
## 4 DuckDuckGo 10000-100000 5+ 0.335 2477
## 5 Bing 1000-10000 5+ 0.334 4505
## 6 DuckDuckGo 1000-10000 4 0.331 7775
## 7 DuckDuckGo 10000-100000 4 0.326 5365
## 8 Bing 1000-10000 4 0.326 7788
## 9 DuckDuckGo 500-1000 4 0.323 7517
## 10 Bing 10000-100000 5+ 0.323 2483
## 11 Bing 10000-100000 4 0.319 5372
## 12 DuckDuckGo 10000-100000 3 0.319 12765
## 13 DuckDuckGo 1000-10000 3 0.319 14931
## 14 Bing 500-1000 4 0.316 7543
## 15 DuckDuckGo 500-1000 3 0.315 12176
## 16 DuckDuckGo 10000-100000 1 0.313 6106
## 17 Yahoo 1000-10000 5+ 0.312 4509
## 18 Bing 10000-100000 3 0.311 12785
## 19 Yahoo 500-1000 5+ 0.310 4864
## 20 Bing 1000-10000 3 0.310 14947
## 21 Bing 500-1000 3 0.307 12183
## 22 DuckDuckGo 1000-10000 2 0.305 13790
## 23 Yahoo 10000-100000 5+ 0.302 2487
## 24 Yahoo 1000-10000 4 0.301 7789
## 25 DuckDuckGo 10000-100000 2 0.300 17550
## 26 Bing 10000-100000 1 0.299 6121
## 27 Yahoo 10000-100000 4 0.296 5380
## 28 DuckDuckGo 500-1000 2 0.295 9979
## 29 Yahoo 10000-100000 1 0.295 6109
## 30 Bing 1000-10000 2 0.293 13811
## 31 Yahoo 500-1000 4 0.292 7540
## 32 Yahoo 10000-100000 3 0.291 12791
## 33 Bing 10000-100000 2 0.289 17562
## 34 Yahoo 1000-10000 3 0.289 14941
## 35 Yahoo 500-1000 3 0.289 12188
## 36 Yahoo 1000-10000 2 0.282 13799
## 37 Bing 500-1000 2 0.281 9987
## 38 DuckDuckGo 1000-10000 1 0.279 2140
## 39 Yahoo 10000-100000 2 0.275 17562
## 40 Yahoo 500-1000 2 0.272 9996
## 41 DuckDuckGo 500-1000 1 0.270 1241
## 42 Bing 1000-10000 1 0.264 2146
## 43 Yahoo 1000-10000 1 0.258 2144
## 44 Bing 500-1000 1 0.252 1247
## 45 Yahoo 500-1000 1 0.250 1241
# Rows used to label the lines of the faceted plot: only the "5+" keyword
# length within the "0.5k-1k" volume level, after relabelling the volume
# levels with short names (labels are matched positionally to the levels).
se_labels <- filter(
  mutate(
    similarity_by_se_vol_and_kw_length,
    monthly_search_volume_level = factor(
      monthly_search_volume_level,
      labels = c("0.5k-1k", "1k-10k", "10k-1m")
    )
  ),
  monthly_search_volume_level == "0.5k-1k",
  keyword_length == "5+"
)
# Plot 11: similarity to Google against keyword length, one line per search
# engine and one facet per monthly search-volume level.
p <- ggplot(
  data = mutate(
    similarity_by_se_vol_and_kw_length,
    # Short facet titles; labels are matched positionally to the levels
    monthly_search_volume_level = factor(
      monthly_search_volume_level,
      labels = c("0.5k-1k", "1k-10k", "10k-1m")
    )
  ),
  mapping = aes(x = keyword_length, y = similarity, colour = search_engine, group = search_engine)
) +
  geom_line() +
  geom_point(
    aes(fill = search_engine),
    shape = 21, size = 4, stroke = 0.6, color = "white"
  ) +
  # Engine names come from `se_labels`; the nudge_x vector presumably holds
  # one horizontal offset per row of `se_labels` -- verify if its rows change
  ggrepel::geom_text_repel(
    data = se_labels,
    aes(label = search_engine),
    size = 3, hjust = 0,
    nudge_x = c(-1.3, -0.7, -0.8), nudge_y = 0.005,
    fontface = "bold", family = "Poppins"
  ) +
  facet_wrap(~monthly_search_volume_level, ncol = 3) +
  coord_cartesian(clip = "off") +
  scale_x_discrete(expand = expansion(0.01)) +
  scale_y_continuous(
    limits = c(0.25, 0.4),
    breaks = seq(0.25, 0.4, by = 0.05),
    labels = scales::label_percent(accuracy = 1),
    expand = expansion(0)
  ) +
  scale_color_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  scale_fill_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  labs(
    title = "Search Engine similarity to Google by Keyword Length",
    subtitle = "Top 10 - breakdown by volume level",
    x = "Keyword Length in Words",
    y = "Search Engine Similarity",
    colour = "Search Engine"
  ) +
  theme(
    plot.subtitle = element_text(face = "bold", color = "grey40", margin = margin(b = 15)),
    plot.title = element_text(face = "bold", margin = margin(b = 5))
  )

# Render the plot into a PNG file through the ragg device
ragg::agg_png(
  filename = here::here("plots", "plot_11_plot_similarity_vol_keyword_length.png"),
  width = 9, height = 4.5, units = "in", res = 320
)
print(p)
dev.off()
## png
## 2
Below we show, both for single word searches and long searches, 5 examples for which we found the same url on the top for every search engine.
We see that the single word searches here are either brands, or products, leading to an official company webpage, or the definition of a technical word.
Long searches are more specific and tend to yield results that include the search, barely modified, in their address.
## # A tibble: 5 x 3
## keyword domain url
## <chr> <chr> <chr>
## 1 resurfx lumenis.com https://lumenis.com/aesthetics/products/m22/r~
## 2 kribensis www.thesprucepets.com https://www.thesprucepets.com/kribensis-13782~
## 3 besetting www.merriam-webster.~ https://www.merriam-webster.com/dictionary/be~
## 4 piz-zetta piz-zetta.com http://piz-zetta.com/
## 5 fsbo fsbo.com https://fsbo.com/
## # A tibble: 5 x 3
## keyword domain url
## <chr> <chr> <chr>
## 1 edwards funeral home arcola~ www.edwardsfh~ https://www.edwardsfh.net/
## 2 difference between anxiety ~ www.healthlin~ https://www.healthline.com/health~
## 3 what is mochi made of en.wikipedia.~ https://en.wikipedia.org/wiki/Moc~
## 4 how to make a rag quilt www.thespruce~ https://www.thesprucecrafts.com/l~
## 5 yes we have no bananas en.wikipedia.~ https://en.wikipedia.org/wiki/Yes~
We take a look at the top 10 domains featured in 1st position of Google searches and see for what proportion of keywords they come up on top.
We see that search engines vary noticeably in which domain they put on top.
# Rename the "DuckDuck" factor level to the engine's full name, then list
# every row of the top-result domain table.
top_domains <- mutate(
  top_domains,
  search_engine = fct_recode(search_engine, "DuckDuckGo" = "DuckDuck")
)
print(top_domains, n = Inf)
## # A tibble: 40 x 4
## search_engine domain n pct
## <fct> <fct> <int> <dbl>
## 1 Yahoo en.wikipedia.org 5343 0.0954
## 2 Google en.wikipedia.org 4897 0.0875
## 3 DuckDuckGo www.amazon.com 3852 0.0688
## 4 Yahoo www.amazon.com 3712 0.0663
## 5 DuckDuckGo en.wikipedia.org 3707 0.0662
## 6 Bing en.wikipedia.org 3652 0.0652
## 7 Bing www.amazon.com 3492 0.0624
## 8 Google www.amazon.com 3101 0.0554
## 9 Google www.youtube.com 1321 0.0236
## 10 DuckDuckGo www.youtube.com 1260 0.0225
## 11 DuckDuckGo www.imdb.com 1166 0.0208
## 12 DuckDuckGo www.merriam-webster.com 1137 0.0203
## 13 Bing www.merriam-webster.com 1136 0.0203
## 14 Yahoo www.youtube.com 1129 0.0202
## 15 Bing www.imdb.com 1124 0.0201
## 16 Yahoo www.merriam-webster.com 1121 0.0200
## 17 Bing www.youtube.com 1116 0.0199
## 18 Yahoo www.imdb.com 997 0.0178
## 19 Google www.merriam-webster.com 932 0.0166
## 20 Yahoo www.tripadvisor.com 796 0.0142
## 21 Bing www.tripadvisor.com 768 0.0137
## 22 DuckDuckGo www.tripadvisor.com 757 0.0135
## 23 Google www.homedepot.com 657 0.0117
## 24 DuckDuckGo www.homedepot.com 566 0.0101
## 25 Bing www.homedepot.com 563 0.0101
## 26 Yahoo www.homedepot.com 559 0.00999
## 27 Google www.pinterest.com 542 0.00968
## 28 Google www.tripadvisor.com 514 0.00918
## 29 Bing www.healthline.com 483 0.00863
## 30 Google weather.com 464 0.00829
## 31 DuckDuckGo www.healthline.com 454 0.00811
## 32 Google www.imdb.com 446 0.00797
## 33 Google www.healthline.com 430 0.00768
## 34 Yahoo www.healthline.com 420 0.00750
## 35 Yahoo www.pinterest.com 400 0.00715
## 36 DuckDuckGo www.pinterest.com 363 0.00648
## 37 Bing www.pinterest.com 354 0.00632
## 38 DuckDuckGo weather.com 236 0.00422
## 39 Yahoo weather.com 205 0.00366
## 40 Bing weather.com 200 0.00357
# For every domain, attach Google's share (`google_pct`) to each competitor
# row, then drop the Google rows themselves. Rows are ordered by domain and
# search engine.
compare_to_google <-
  top_domains %>%
  group_by(domain) %>%
  # Look up Google's share within the domain directly. The previous
  # arrange + fill-down was only correct while "Google" sorted first among
  # the search engines. Domains without a Google row get NA.
  mutate(google_pct = pct[match("Google", search_engine)]) %>%
  ungroup() %>%
  arrange(domain, search_engine) %>%
  filter(search_engine != "Google")
# Plot 12: for each big domain (one facet per domain), the share of keywords
# for which the domain is the top result on each search engine, compared with
# Google's share (vertical line).
p <-
  ggplot() +
  # Segment from each engine's share to Google's share for the same domain
  geom_segment(
    data = compare_to_google,
    aes(x = pct, xend = google_pct, y = search_engine, yend = search_engine, color = search_engine),
    size = 1.1, show.legend = FALSE
  ) +
  # Google's share as a vertical reference line in every facet
  geom_vline(
    data = filter(top_domains, search_engine == "Google"),
    aes(xintercept = pct), size = 1.1, color = "#574c99"
  ) +
  geom_point(
    data = filter(top_domains, search_engine != "Google"),
    aes(pct, search_engine, fill = search_engine),
    shape = 21, size = 7, color = "white", stroke = 1
  ) +
  # The reference line is labelled "Google" in the Wikipedia facet only
  geom_text(
    data = filter(top_domains, search_engine == "Google", domain == "en.wikipedia.org"),
    aes(x = pct, y = 1, label = "Google"),
    nudge_x = 0.001, hjust = 0, vjust = 1, angle = -270,
    fontface = "bold", family = "Poppins", size = 5, color = "#574c99", alpha = 0.8
  ) +
  labs(
    # Closing curly quote added; the original title left the quotation open
    title = "Every search engine has its favorite “big domains”",
    subtitle = "Based on Top Result",
    x = "Share of presence in top spot",
    y = "",
    fill = "Search Engine"
  ) +
  facet_wrap(~domain, ncol = 2, strip.position = "top", scales = "free_x") +
  scale_x_continuous(
    limits = c(0, 0.1),
    breaks = seq(0, 0.1, 0.025),
    labels = scales::label_percent(accuracy = 0.1),
    expand = expansion(0)
  ) +
  scale_fill_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0"),
    limits = c("Yahoo", "Bing", "DuckDuckGo")
  ) +
  scale_color_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  coord_cartesian(clip = "off") +
  guides(
    fill = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      label.theme = element_text(size = 9, face = "bold", color = "grey30"),
      nrow = 1,
      # Smaller keys in the legend than the size-7 points in the panels
      override.aes = list(size = 3, stroke = 0.6)
    )
  ) +
  theme(
    plot.title = element_text(face = "bold", margin = margin(b = 5)),
    plot.subtitle = element_text(face = "bold", color = "grey40", margin = margin(b = 15)),
    # Engines are identified by colour, so the y-axis text is hidden
    axis.text.y = element_blank(),
    panel.border = element_rect(size = 0.6, color = "white", fill = NA),
    panel.spacing.y = unit(1, "lines"),
    panel.spacing.x = unit(3, "lines"),
    legend.position = c(0.85, 1.07),
    legend.title = element_text(size = rel(0.8), face = "bold", color = "grey60"),
    legend.spacing.x = unit(0.5, "mm")
  )

# Render the plot into a PNG file through the ragg device
ragg::agg_png(
  here::here("plots", "plot_12_plot_top_domains_top_result.png"),
  width = 9, height = 12, units = "in", res = 320
)
print(p)
dev.off()
## png
## 2
We reproduce the analysis considering the whole top 10 results rather than only the top result.
# Rename the "DuckDuck" factor level to the engine's full name, then list
# every row of the top-10 domain table.
top_domains_10 <- mutate(
  top_domains_10,
  search_engine = fct_recode(search_engine, "DuckDuckGo" = "DuckDuck")
)
print(top_domains_10, n = Inf)
## # A tibble: 40 x 4
## search_engine domain n pct
## <fct> <fct> <int> <dbl>
## 1 Yahoo en.wikipedia.org 21854 0.0390
## 2 DuckDuckGo www.amazon.com 21726 0.0388
## 3 Bing www.amazon.com 19439 0.0347
## 4 Google en.wikipedia.org 18995 0.0339
## 5 Yahoo www.amazon.com 18868 0.0337
## 6 Bing en.wikipedia.org 18313 0.0327
## 7 DuckDuckGo en.wikipedia.org 17867 0.0319
## 8 Google www.amazon.com 16836 0.0301
## 9 DuckDuckGo www.youtube.com 12591 0.0225
## 10 Yahoo www.youtube.com 12078 0.0216
## 11 Bing www.youtube.com 11680 0.0209
## 12 Google www.youtube.com 9978 0.0178
## 13 Google www.facebook.com 9017 0.0161
## 14 Google www.pinterest.com 8909 0.0159
## 15 Bing www.tripadvisor.com 5505 0.00983
## 16 DuckDuckGo www.tripadvisor.com 5474 0.00978
## 17 Google www.walmart.com 5274 0.00942
## 18 Yahoo www.tripadvisor.com 5258 0.00939
## 19 Google www.yelp.com 5134 0.00917
## 20 DuckDuckGo www.yelp.com 4858 0.00868
## 21 DuckDuckGo www.imdb.com 4639 0.00829
## 22 Bing www.yelp.com 4614 0.00824
## 23 Bing www.facebook.com 4571 0.00817
## 24 DuckDuckGo www.facebook.com 4443 0.00794
## 25 Bing www.imdb.com 4442 0.00794
## 26 Yahoo www.imdb.com 4421 0.00790
## 27 Bing www.ebay.com 4326 0.00773
## 28 Bing www.walmart.com 4209 0.00752
## 29 Google www.tripadvisor.com 4171 0.00745
## 30 Yahoo www.facebook.com 4151 0.00742
## 31 DuckDuckGo www.walmart.com 4111 0.00734
## 32 Yahoo www.walmart.com 3872 0.00692
## 33 Yahoo www.ebay.com 3810 0.00681
## 34 Yahoo www.yelp.com 3648 0.00652
## 35 Google www.imdb.com 3478 0.00621
## 36 Google www.ebay.com 3377 0.00603
## 37 Yahoo www.pinterest.com 3163 0.00565
## 38 Bing www.pinterest.com 3044 0.00544
## 39 DuckDuckGo www.pinterest.com 2939 0.00525
## 40 DuckDuckGo www.ebay.com 8 0.0000143
# For every domain, attach Google's share (`google_pct`) to each competitor
# row, then drop the Google rows themselves. Rows are ordered by domain and
# search engine.
compare_to_google <-
  top_domains_10 %>%
  group_by(domain) %>%
  # Look up Google's share within the domain directly. The previous
  # arrange + fill-down was only correct while "Google" sorted first among
  # the search engines. Domains without a Google row get NA.
  mutate(google_pct = pct[match("Google", search_engine)]) %>%
  ungroup() %>%
  arrange(domain, search_engine) %>%
  filter(search_engine != "Google")
print(compare_to_google, n = Inf)
## # A tibble: 30 x 5
## search_engine domain n pct google_pct
## <fct> <fct> <int> <dbl> <dbl>
## 1 DuckDuckGo en.wikipedia.org 17867 0.0319 0.0339
## 2 Bing en.wikipedia.org 18313 0.0327 0.0339
## 3 Yahoo en.wikipedia.org 21854 0.0390 0.0339
## 4 DuckDuckGo www.amazon.com 21726 0.0388 0.0301
## 5 Bing www.amazon.com 19439 0.0347 0.0301
## 6 Yahoo www.amazon.com 18868 0.0337 0.0301
## 7 DuckDuckGo www.youtube.com 12591 0.0225 0.0178
## 8 Bing www.youtube.com 11680 0.0209 0.0178
## 9 Yahoo www.youtube.com 12078 0.0216 0.0178
## 10 DuckDuckGo www.facebook.com 4443 0.00794 0.0161
## 11 Bing www.facebook.com 4571 0.00817 0.0161
## 12 Yahoo www.facebook.com 4151 0.00742 0.0161
## 13 DuckDuckGo www.pinterest.com 2939 0.00525 0.0159
## 14 Bing www.pinterest.com 3044 0.00544 0.0159
## 15 Yahoo www.pinterest.com 3163 0.00565 0.0159
## 16 DuckDuckGo www.walmart.com 4111 0.00734 0.00942
## 17 Bing www.walmart.com 4209 0.00752 0.00942
## 18 Yahoo www.walmart.com 3872 0.00692 0.00942
## 19 DuckDuckGo www.yelp.com 4858 0.00868 0.00917
## 20 Bing www.yelp.com 4614 0.00824 0.00917
## 21 Yahoo www.yelp.com 3648 0.00652 0.00917
## 22 DuckDuckGo www.tripadvisor.com 5474 0.00978 0.00745
## 23 Bing www.tripadvisor.com 5505 0.00983 0.00745
## 24 Yahoo www.tripadvisor.com 5258 0.00939 0.00745
## 25 DuckDuckGo www.imdb.com 4639 0.00829 0.00621
## 26 Bing www.imdb.com 4442 0.00794 0.00621
## 27 Yahoo www.imdb.com 4421 0.00790 0.00621
## 28 DuckDuckGo www.ebay.com 8 0.0000143 0.00603
## 29 Bing www.ebay.com 4326 0.00773 0.00603
## 30 Yahoo www.ebay.com 3810 0.00681 0.00603
# Plot 13: for each big domain (one facet per domain), its share among the
# top 10 results on each search engine, compared with Google's share
# (vertical line).
p <-
  ggplot() +
  # Segment from each engine's share to Google's share for the same domain
  geom_segment(
    data = compare_to_google,
    aes(x = pct, xend = google_pct, y = search_engine, yend = search_engine, color = search_engine),
    size = 1.1, show.legend = FALSE
  ) +
  # Google's share as a vertical reference line in every facet
  geom_vline(
    data = filter(top_domains_10, search_engine == "Google"),
    aes(xintercept = pct), size = 1.1, color = "#574c99"
  ) +
  geom_point(
    data = filter(top_domains_10, search_engine != "Google"),
    aes(pct, search_engine, fill = search_engine),
    shape = 21, size = 7, color = "white", stroke = 1
  ) +
  # The reference line is labelled "Google" in the Wikipedia facet only
  geom_text(
    data = filter(top_domains_10, search_engine == "Google", domain == "en.wikipedia.org"),
    aes(x = pct, y = 1, label = "Google"),
    nudge_x = 0.001, hjust = 0, vjust = 1, angle = -270,
    fontface = "bold", family = "Poppins", size = 5, color = "#574c99", alpha = 0.8
  ) +
  labs(
    title = "Prevalence of big domains in searches",
    subtitle = "Based on Top 10 Results",
    # Was "top spot", copied from the top-result plot; this one covers the
    # top 10 results
    x = "Share of presence in top 10 results",
    y = "",
    fill = "Search Engine"
  ) +
  facet_wrap(~domain, ncol = 2, strip.position = "top", scales = "free_x") +
  scale_x_continuous(
    limits = c(0, 0.05),
    breaks = seq(0, 0.05, 0.01),
    labels = scales::label_percent(accuracy = 0.1),
    expand = expansion(0)
  ) +
  scale_fill_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0"),
    limits = c("Yahoo", "Bing", "DuckDuckGo")
  ) +
  scale_color_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  coord_cartesian(clip = "off") +
  guides(
    fill = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      label.theme = element_text(size = 9, face = "bold", color = "grey30"),
      nrow = 1,
      # Smaller keys in the legend than the size-7 points in the panels
      override.aes = list(size = 3, stroke = 0.6)
    )
  ) +
  theme(
    plot.title = element_text(face = "bold", margin = margin(b = 5)),
    plot.subtitle = element_text(face = "bold", color = "grey40", margin = margin(b = 15)),
    # Engines are identified by colour, so the y-axis text is hidden
    axis.text.y = element_blank(),
    panel.border = element_rect(size = 0.6, color = "white", fill = NA),
    panel.spacing.y = unit(1, "lines"),
    panel.spacing.x = unit(3, "lines"),
    legend.position = c(0.85, 1.07),
    legend.title = element_text(size = rel(0.8), face = "bold", color = "grey60"),
    legend.spacing.x = unit(0.5, "mm")
  )

# Render the plot into a PNG file through the ragg device
ragg::agg_png(
  here::here("plots", "plot_13_plot_top_domains_top_10_result.png"),
  width = 9, height = 12, units = "in", res = 320
)
print(p)
dev.off()
## png
## 2
Find below the top domains considering top 3, top 20 and top 30
# Rename the "DuckDuck" factor level to the engine's full name, then list
# every row of the top-3 domain table.
top_domains_3 <- mutate(
  top_domains_3,
  search_engine = fct_recode(search_engine, "DuckDuckGo" = "DuckDuck")
)
print(top_domains_3, n = Inf)
## # A tibble: 40 x 4
## search_engine domain n pct
## <fct> <fct> <int> <dbl>
## 1 Google en.wikipedia.org 11555 0.0688
## 2 Yahoo en.wikipedia.org 11362 0.0677
## 3 DuckDuckGo www.amazon.com 10110 0.0602
## 4 Yahoo www.amazon.com 9702 0.0578
## 5 Bing www.amazon.com 9305 0.0554
## 6 DuckDuckGo en.wikipedia.org 8768 0.0522
## 7 Bing en.wikipedia.org 8690 0.0517
## 8 Google www.amazon.com 8615 0.0513
## 9 Google www.youtube.com 4136 0.0246
## 10 DuckDuckGo www.youtube.com 3737 0.0223
## 11 Yahoo www.youtube.com 3394 0.0202
## 12 Bing www.youtube.com 3236 0.0193
## 13 DuckDuckGo www.imdb.com 2593 0.0154
## 14 Google www.facebook.com 2587 0.0154
## 15 Bing www.imdb.com 2574 0.0153
## 16 Bing www.tripadvisor.com 2503 0.0149
## 17 Yahoo www.imdb.com 2466 0.0147
## 18 Bing www.yelp.com 2415 0.0144
## 19 DuckDuckGo www.yelp.com 2387 0.0142
## 20 Yahoo www.tripadvisor.com 2375 0.0141
## 21 DuckDuckGo www.tripadvisor.com 2373 0.0141
## 22 Yahoo www.merriam-webster.com 2335 0.0139
## 23 Bing www.merriam-webster.com 2321 0.0138
## 24 DuckDuckGo www.merriam-webster.com 2273 0.0135
## 25 Google www.pinterest.com 2193 0.0131
## 26 Google www.merriam-webster.com 2058 0.0123
## 27 Google www.imdb.com 1839 0.0110
## 28 Yahoo www.yelp.com 1714 0.0102
## 29 Google www.tripadvisor.com 1654 0.00985
## 30 Google www.homedepot.com 1557 0.00927
## 31 Google www.yelp.com 1542 0.00918
## 32 Bing www.homedepot.com 1383 0.00824
## 33 DuckDuckGo www.homedepot.com 1324 0.00788
## 34 Yahoo www.homedepot.com 1301 0.00775
## 35 Yahoo www.pinterest.com 1229 0.00732
## 36 Bing www.pinterest.com 1205 0.00718
## 37 DuckDuckGo www.pinterest.com 1116 0.00665
## 38 DuckDuckGo www.facebook.com 1106 0.00659
## 39 Bing www.facebook.com 1003 0.00597
## 40 Yahoo www.facebook.com 899 0.00535
# For every domain, attach Google's share (`google_pct`) to each competitor
# row, then drop the Google rows themselves. Rows are ordered by domain and
# search engine.
compare_to_google <-
  top_domains_3 %>%
  group_by(domain) %>%
  # Look up Google's share within the domain directly. The previous
  # arrange + fill-down was only correct while "Google" sorted first among
  # the search engines. Domains without a Google row get NA.
  mutate(google_pct = pct[match("Google", search_engine)]) %>%
  ungroup() %>%
  arrange(domain, search_engine) %>%
  filter(search_engine != "Google")
print(compare_to_google, n = Inf)
## # A tibble: 30 x 5
## search_engine domain n pct google_pct
## <fct> <fct> <int> <dbl> <dbl>
## 1 DuckDuckGo en.wikipedia.org 8768 0.0522 0.0688
## 2 Bing en.wikipedia.org 8690 0.0517 0.0688
## 3 Yahoo en.wikipedia.org 11362 0.0677 0.0688
## 4 DuckDuckGo www.amazon.com 10110 0.0602 0.0513
## 5 Bing www.amazon.com 9305 0.0554 0.0513
## 6 Yahoo www.amazon.com 9702 0.0578 0.0513
## 7 DuckDuckGo www.youtube.com 3737 0.0223 0.0246
## 8 Bing www.youtube.com 3236 0.0193 0.0246
## 9 Yahoo www.youtube.com 3394 0.0202 0.0246
## 10 DuckDuckGo www.facebook.com 1106 0.00659 0.0154
## 11 Bing www.facebook.com 1003 0.00597 0.0154
## 12 Yahoo www.facebook.com 899 0.00535 0.0154
## 13 DuckDuckGo www.pinterest.com 1116 0.00665 0.0131
## 14 Bing www.pinterest.com 1205 0.00718 0.0131
## 15 Yahoo www.pinterest.com 1229 0.00732 0.0131
## 16 DuckDuckGo www.merriam-webster.com 2273 0.0135 0.0123
## 17 Bing www.merriam-webster.com 2321 0.0138 0.0123
## 18 Yahoo www.merriam-webster.com 2335 0.0139 0.0123
## 19 DuckDuckGo www.imdb.com 2593 0.0154 0.0110
## 20 Bing www.imdb.com 2574 0.0153 0.0110
## 21 Yahoo www.imdb.com 2466 0.0147 0.0110
## 22 DuckDuckGo www.tripadvisor.com 2373 0.0141 0.00985
## 23 Bing www.tripadvisor.com 2503 0.0149 0.00985
## 24 Yahoo www.tripadvisor.com 2375 0.0141 0.00985
## 25 DuckDuckGo www.homedepot.com 1324 0.00788 0.00927
## 26 Bing www.homedepot.com 1383 0.00824 0.00927
## 27 Yahoo www.homedepot.com 1301 0.00775 0.00927
## 28 DuckDuckGo www.yelp.com 2387 0.0142 0.00918
## 29 Bing www.yelp.com 2415 0.0144 0.00918
## 30 Yahoo www.yelp.com 1714 0.0102 0.00918
# Plot 14: for each big domain (one facet per domain), its share among the
# top 3 results on each search engine, compared with Google's share
# (vertical line).
p <-
  ggplot() +
  # Segment from each engine's share to Google's share for the same domain
  geom_segment(
    data = compare_to_google,
    aes(x = pct, xend = google_pct, y = search_engine, yend = search_engine, color = search_engine),
    size = 1.1, show.legend = FALSE
  ) +
  # Google's share as a vertical reference line in every facet
  geom_vline(
    data = filter(top_domains_3, search_engine == "Google"),
    aes(xintercept = pct), size = 1.1, color = "#574c99"
  ) +
  geom_point(
    data = filter(top_domains_3, search_engine != "Google"),
    aes(pct, search_engine, fill = search_engine),
    shape = 21, size = 7, color = "white", stroke = 1
  ) +
  # The reference line is labelled "Google" in the Wikipedia facet only
  geom_text(
    data = filter(top_domains_3, search_engine == "Google", domain == "en.wikipedia.org"),
    aes(x = pct, y = 1, label = "Google"),
    nudge_x = 0.001, hjust = 0, vjust = 1, angle = -270,
    fontface = "bold", family = "Poppins", size = 5, color = "#574c99", alpha = 0.8
  ) +
  labs(
    title = "Prevalence of big domains in searches",
    subtitle = "Based on Top 3 Results",
    # Was "top spot", copied from the top-result plot; this one covers the
    # top 3 results
    x = "Share of presence in top 3 results",
    y = "",
    fill = "Search Engine"
  ) +
  facet_wrap(~domain, ncol = 2, strip.position = "top", scales = "free_x") +
  scale_x_continuous(
    limits = c(0, 0.08),
    breaks = seq(0, 0.08, 0.02),
    labels = scales::label_percent(accuracy = 0.1),
    expand = expansion(0)
  ) +
  scale_fill_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0"),
    limits = c("Yahoo", "Bing", "DuckDuckGo")
  ) +
  scale_color_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  coord_cartesian(clip = "off") +
  guides(
    fill = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      label.theme = element_text(size = 9, face = "bold", color = "grey30"),
      nrow = 1,
      # Smaller keys in the legend than the size-7 points in the panels
      override.aes = list(size = 3, stroke = 0.6)
    )
  ) +
  theme(
    plot.title = element_text(face = "bold", margin = margin(b = 5)),
    plot.subtitle = element_text(face = "bold", color = "grey40", margin = margin(b = 15)),
    # Engines are identified by colour, so the y-axis text is hidden
    axis.text.y = element_blank(),
    panel.border = element_rect(size = 0.6, color = "white", fill = NA),
    panel.spacing.y = unit(1, "lines"),
    panel.spacing.x = unit(3, "lines"),
    legend.position = c(0.85, 1.07),
    legend.title = element_text(size = rel(0.8), face = "bold", color = "grey60"),
    legend.spacing.x = unit(0.5, "mm")
  )

# Render the plot into a PNG file through the ragg device
ragg::agg_png(
  here::here("plots", "plot_14_plot_top_domains_top_03_results.png"),
  width = 9, height = 12, units = "in", res = 320
)
print(p)
dev.off()
## png
## 2
# Rename the "DuckDuck" factor level to the engine's full name, then list
# every row of the top-20 domain table.
top_domains_20 <- mutate(
  top_domains_20,
  search_engine = fct_recode(search_engine, "DuckDuckGo" = "DuckDuck")
)
print(top_domains_20, n = Inf)
## # A tibble: 40 x 4
## search_engine domain n pct
## <fct> <fct> <int> <dbl>
## 1 DuckDuckGo www.amazon.com 37668 0.0336
## 2 Bing www.amazon.com 34198 0.0305
## 3 Yahoo en.wikipedia.org 31544 0.0282
## 4 Yahoo missing_domain 31017 0.0277
## 5 Yahoo www.amazon.com 28358 0.0253
## 6 Bing en.wikipedia.org 28266 0.0252
## 7 DuckDuckGo en.wikipedia.org 28156 0.0251
## 8 DuckDuckGo www.youtube.com 26462 0.0236
## 9 Yahoo www.youtube.com 25186 0.0225
## 10 Bing www.youtube.com 24084 0.0215
## 11 Google www.amazon.com 24063 0.0215
## 12 Google en.wikipedia.org 23631 0.0211
## 13 Google www.pinterest.com 16955 0.0151
## 14 Google www.facebook.com 14581 0.0130
## 15 Google www.youtube.com 13879 0.0124
## 16 DuckDuckGo missing_domain 12508 0.0112
## 17 Bing www.ebay.com 11889 0.0106
## 18 Bing www.facebook.com 11083 0.00990
## 19 DuckDuckGo www.facebook.com 10943 0.00977
## 20 Yahoo www.facebook.com 10229 0.00914
## 21 DuckDuckGo www.tripadvisor.com 9557 0.00854
## 22 Yahoo www.ebay.com 9481 0.00847
## 23 Bing www.tripadvisor.com 9294 0.00830
## 24 Google www.walmart.com 8866 0.00792
## 25 DuckDuckGo www.yelp.com 8851 0.00791
## 26 Yahoo www.tripadvisor.com 8665 0.00774
## 27 Bing www.yelp.com 8564 0.00765
## 28 Google www.ebay.com 7749 0.00692
## 29 Google www.yelp.com 7255 0.00648
## 30 DuckDuckGo www.walmart.com 7149 0.00639
## 31 Bing www.walmart.com 6968 0.00622
## 32 Yahoo www.yelp.com 6789 0.00606
## 33 Yahoo www.pinterest.com 6609 0.00590
## 34 DuckDuckGo www.pinterest.com 6588 0.00588
## 35 Bing www.pinterest.com 6338 0.00566
## 36 Google missing_domain 6202 0.00554
## 37 Google www.tripadvisor.com 5905 0.00527
## 38 Yahoo www.walmart.com 5605 0.00501
## 39 Bing missing_domain 4863 0.00434
## 40 DuckDuckGo www.ebay.com 13 0.0000116
# For every domain except the "missing_domain" placeholder, attach Google's
# share (`google_pct`) to each competitor row, then drop the Google rows
# themselves. Rows are ordered by domain and search engine.
compare_to_google <-
  top_domains_20 %>%
  filter(domain != "missing_domain") %>%
  group_by(domain) %>%
  # Look up Google's share within the domain directly. The previous
  # arrange + fill-down was only correct while "Google" sorted first among
  # the search engines. Domains without a Google row get NA.
  mutate(google_pct = pct[match("Google", search_engine)]) %>%
  ungroup() %>%
  arrange(domain, search_engine) %>%
  filter(search_engine != "Google")
print(compare_to_google, n = Inf)
## # A tibble: 27 x 5
## search_engine domain n pct google_pct
## <fct> <fct> <int> <dbl> <dbl>
## 1 DuckDuckGo www.amazon.com 37668 0.0336 0.0215
## 2 Bing www.amazon.com 34198 0.0305 0.0215
## 3 Yahoo www.amazon.com 28358 0.0253 0.0215
## 4 DuckDuckGo en.wikipedia.org 28156 0.0251 0.0211
## 5 Bing en.wikipedia.org 28266 0.0252 0.0211
## 6 Yahoo en.wikipedia.org 31544 0.0282 0.0211
## 7 DuckDuckGo www.pinterest.com 6588 0.00588 0.0151
## 8 Bing www.pinterest.com 6338 0.00566 0.0151
## 9 Yahoo www.pinterest.com 6609 0.00590 0.0151
## 10 DuckDuckGo www.facebook.com 10943 0.00977 0.0130
## 11 Bing www.facebook.com 11083 0.00990 0.0130
## 12 Yahoo www.facebook.com 10229 0.00914 0.0130
## 13 DuckDuckGo www.youtube.com 26462 0.0236 0.0124
## 14 Bing www.youtube.com 24084 0.0215 0.0124
## 15 Yahoo www.youtube.com 25186 0.0225 0.0124
## 16 DuckDuckGo www.walmart.com 7149 0.00639 0.00792
## 17 Bing www.walmart.com 6968 0.00622 0.00792
## 18 Yahoo www.walmart.com 5605 0.00501 0.00792
## 19 DuckDuckGo www.ebay.com 13 0.0000116 0.00692
## 20 Bing www.ebay.com 11889 0.0106 0.00692
## 21 Yahoo www.ebay.com 9481 0.00847 0.00692
## 22 DuckDuckGo www.yelp.com 8851 0.00791 0.00648
## 23 Bing www.yelp.com 8564 0.00765 0.00648
## 24 Yahoo www.yelp.com 6789 0.00606 0.00648
## 25 DuckDuckGo www.tripadvisor.com 9557 0.00854 0.00527
## 26 Bing www.tripadvisor.com 9294 0.00830 0.00527
## 27 Yahoo www.tripadvisor.com 8665 0.00774 0.00527
# Faceted chart (one panel per big domain) comparing each competitor's share
# of top-20 results (dot) with Google's share (vertical line); a segment joins
# the two. The plot is then written to a PNG with the ragg device.
p <-
  ggplot() +
  # segment from the competitor's share to Google's share
  geom_segment(
    data = compare_to_google,
    aes(
      x = pct, xend = google_pct,
      y = search_engine, yend = search_engine,
      color = search_engine
    ),
    size = 1.1, show.legend = FALSE
  ) +
  # Google's share in each facet
  geom_vline(
    data = filter(top_domains_20, search_engine == "Google", domain != "missing_domain"),
    aes(xintercept = pct), size = 1.1, color = "#574c99"
  ) +
  # competitors' shares
  geom_point(
    data = filter(top_domains_20, search_engine != "Google", domain != "missing_domain"),
    aes(pct, search_engine, fill = search_engine),
    shape = 21, size = 7, color = "white", stroke = 1
  ) +
  # label the Google line once, in the www.amazon.com facet only
  geom_text(
    data = filter(top_domains_20, search_engine == "Google", domain == "www.amazon.com"),
    aes(x = pct, y = 1, label = "Google"),
    nudge_x = -0.005, hjust = 0, vjust = 1, angle = -270,
    fontface = "bold", family = "Poppins", size = 5,
    color = "#574c99", alpha = 0.8
  ) +
  labs(
    title = "Prevalence of big domains in searches",
    subtitle = "Based on Top 20 Results",
    # `pct` is a share of all top-20 results, not of first-ranked results
    x = "Share of presence in top 20 results",
    y = "",
    fill = "Search Engine"
  ) +
  facet_wrap(~domain, ncol = 3, strip.position = "top", scales = "free_x") +
  scale_x_continuous(
    limits = c(0, 0.05),
    breaks = seq(0, 0.05, 0.01),
    labels = scales::label_percent(accuracy = 0.1),
    expand = expansion(0)
  ) +
  scale_fill_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0"),
    limits = c("Yahoo", "Bing", "DuckDuckGo")
  ) +
  scale_color_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  coord_cartesian(clip = "off") +
  guides(
    fill = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      label.theme = element_text(size = 9, face = "bold", color = "grey30"),
      nrow = 1,
      override.aes = list(size = 3, stroke = 0.6)
    )
  ) +
  theme(
    plot.title = element_text(face = "bold", margin = margin(b = 5)),
    plot.subtitle = element_text(face = "bold", color = "grey40", margin = margin(b = 15)),
    axis.text.y = element_blank(),
    panel.border = element_rect(size = 0.6, color = "white", fill = NA),
    panel.spacing.y = unit(1, "lines"),
    panel.spacing.x = unit(3, "lines"),
    legend.position = c(0.85, 1.09),
    legend.title = element_text(size = rel(0.8), face = "bold", color = "grey60"),
    legend.spacing.x = unit(0.5, "mm")
  )
# Render the plot to a 9 x 8 inch PNG at 320 dpi
ragg::agg_png(here::here("plots", "plot_15_plot_top_domains_top_20_results.png"), width = 9, height = 8, units = "in", res = 320)
print(p)
dev.off()
## png
## 2
# Rename the "DuckDuck" factor level to "DuckDuckGo", then show every row.
top_domains_30 <- mutate(
  top_domains_30,
  search_engine = fct_recode(search_engine, DuckDuckGo = "DuckDuck")
)
print(top_domains_30, n = Inf)
## # A tibble: 40 x 4
## search_engine domain n pct
## <fct> <fct> <int> <dbl>
## 1 DuckDuckGo missing_domain 113686 0.0677
## 2 Yahoo missing_domain 65534 0.0390
## 3 DuckDuckGo www.amazon.com 47761 0.0284
## 4 Bing www.amazon.com 44886 0.0267
## 5 Yahoo en.wikipedia.org 38661 0.0230
## 6 Yahoo www.amazon.com 36720 0.0219
## 7 DuckDuckGo www.youtube.com 35914 0.0214
## 8 Bing en.wikipedia.org 35907 0.0214
## 9 Yahoo www.youtube.com 35348 0.0210
## 10 DuckDuckGo en.wikipedia.org 35019 0.0209
## 11 Bing www.youtube.com 34768 0.0207
## 12 Google www.amazon.com 29158 0.0174
## 13 Google en.wikipedia.org 26570 0.0158
## 14 Google www.pinterest.com 21487 0.0128
## 15 Bing www.ebay.com 19760 0.0118
## 16 Google www.facebook.com 18995 0.0113
## 17 Bing www.facebook.com 17874 0.0106
## 18 Yahoo www.facebook.com 17237 0.0103
## 19 DuckDuckGo www.facebook.com 17172 0.0102
## 20 Google www.youtube.com 16030 0.00955
## 21 Yahoo www.ebay.com 14940 0.00890
## 22 Bing www.yelp.com 11839 0.00705
## 23 DuckDuckGo www.yelp.com 11516 0.00686
## 24 DuckDuckGo www.etsy.com 11185 0.00666
## 25 Google www.walmart.com 11062 0.00659
## 26 Google www.ebay.com 10661 0.00635
## 27 Yahoo www.pinterest.com 10517 0.00626
## 28 DuckDuckGo www.pinterest.com 10464 0.00623
## 29 Bing www.pinterest.com 10395 0.00619
## 30 Bing www.etsy.com 10279 0.00612
## 31 Google missing_domain 9863 0.00587
## 32 DuckDuckGo www.walmart.com 9858 0.00587
## 33 Bing www.walmart.com 9753 0.00581
## 34 Yahoo www.yelp.com 9540 0.00568
## 35 Google www.yelp.com 8543 0.00509
## 36 Yahoo www.etsy.com 7916 0.00471
## 37 Google www.etsy.com 7738 0.00461
## 38 Bing missing_domain 7559 0.00450
## 39 Yahoo www.walmart.com 7525 0.00448
## 40 DuckDuckGo www.ebay.com 17 0.0000101
# For every big domain, attach Google's share (`google_pct`) to each
# competitor row so the chart can draw a segment from the competitor's share
# to Google's. The value is looked up directly inside each domain group
# rather than filled downwards, so the result does not depend on "Google"
# sorting first among the `search_engine` factor levels.
compare_to_google <-
  top_domains_30 %>%
  filter(domain != "missing_domain") %>%
  arrange(domain, search_engine) %>%
  group_by(domain) %>%
  # `first()` returns NA when a domain has no Google row
  mutate(google_pct = first(pct[search_engine %in% "Google"])) %>%
  ungroup() %>%
  # only competitor rows are kept
  filter(search_engine != "Google")
print(compare_to_google, n = Inf)
## # A tibble: 27 x 5
## search_engine domain n pct google_pct
## <fct> <fct> <int> <dbl> <dbl>
## 1 DuckDuckGo www.amazon.com 47761 0.0284 0.0174
## 2 Bing www.amazon.com 44886 0.0267 0.0174
## 3 Yahoo www.amazon.com 36720 0.0219 0.0174
## 4 DuckDuckGo en.wikipedia.org 35019 0.0209 0.0158
## 5 Bing en.wikipedia.org 35907 0.0214 0.0158
## 6 Yahoo en.wikipedia.org 38661 0.0230 0.0158
## 7 DuckDuckGo www.pinterest.com 10464 0.00623 0.0128
## 8 Bing www.pinterest.com 10395 0.00619 0.0128
## 9 Yahoo www.pinterest.com 10517 0.00626 0.0128
## 10 DuckDuckGo www.facebook.com 17172 0.0102 0.0113
## 11 Bing www.facebook.com 17874 0.0106 0.0113
## 12 Yahoo www.facebook.com 17237 0.0103 0.0113
## 13 DuckDuckGo www.youtube.com 35914 0.0214 0.00955
## 14 Bing www.youtube.com 34768 0.0207 0.00955
## 15 Yahoo www.youtube.com 35348 0.0210 0.00955
## 16 DuckDuckGo www.walmart.com 9858 0.00587 0.00659
## 17 Bing www.walmart.com 9753 0.00581 0.00659
## 18 Yahoo www.walmart.com 7525 0.00448 0.00659
## 19 DuckDuckGo www.ebay.com 17 0.0000101 0.00635
## 20 Bing www.ebay.com 19760 0.0118 0.00635
## 21 Yahoo www.ebay.com 14940 0.00890 0.00635
## 22 DuckDuckGo www.yelp.com 11516 0.00686 0.00509
## 23 Bing www.yelp.com 11839 0.00705 0.00509
## 24 Yahoo www.yelp.com 9540 0.00568 0.00509
## 25 DuckDuckGo www.etsy.com 11185 0.00666 0.00461
## 26 Bing www.etsy.com 10279 0.00612 0.00461
## 27 Yahoo www.etsy.com 7916 0.00471 0.00461
# Faceted chart (one panel per big domain) comparing each competitor's share
# of top-30 results (dot) with Google's share (vertical line); a segment joins
# the two. The plot is then written to a PNG with the ragg device.
p <-
  ggplot() +
  # segment from the competitor's share to Google's share
  geom_segment(
    data = compare_to_google,
    aes(
      x = pct, xend = google_pct,
      y = search_engine, yend = search_engine,
      color = search_engine
    ),
    size = 1.1, show.legend = FALSE
  ) +
  # Google's share in each facet
  geom_vline(
    data = filter(top_domains_30, search_engine == "Google", domain != "missing_domain"),
    aes(xintercept = pct), size = 1.1, color = "#574c99"
  ) +
  # competitors' shares
  geom_point(
    data = filter(top_domains_30, search_engine != "Google", domain != "missing_domain"),
    aes(pct, search_engine, fill = search_engine),
    shape = 21, size = 7, color = "white", stroke = 1
  ) +
  # label the Google line once, in the www.amazon.com facet only
  geom_text(
    data = filter(top_domains_30, search_engine == "Google", domain == "www.amazon.com"),
    aes(x = pct, y = 1, label = "Google"),
    nudge_x = -0.005, hjust = 0, vjust = 1, angle = -270,
    fontface = "bold", family = "Poppins", size = 5,
    color = "#574c99", alpha = 0.8
  ) +
  labs(
    title = "Prevalence of big domains in searches",
    subtitle = "Based on Top 30 Results",
    # `pct` is a share of all top-30 results, not of first-ranked results
    x = "Share of presence in top 30 results",
    y = "",
    fill = "Search Engine"
  ) +
  facet_wrap(~domain, ncol = 3, strip.position = "top", scales = "free_x") +
  scale_x_continuous(
    limits = c(0, 0.05),
    breaks = seq(0, 0.05, 0.01),
    labels = scales::label_percent(accuracy = 0.1),
    expand = expansion(0)
  ) +
  scale_fill_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0"),
    limits = c("Yahoo", "Bing", "DuckDuckGo")
  ) +
  scale_color_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  coord_cartesian(clip = "off") +
  guides(
    fill = guide_legend(
      title.position = "top",
      title.hjust = 0.5,
      label.theme = element_text(size = 9, face = "bold", color = "grey30"),
      nrow = 1,
      override.aes = list(size = 3, stroke = 0.6)
    )
  ) +
  theme(
    plot.title = element_text(face = "bold", margin = margin(b = 5)),
    plot.subtitle = element_text(face = "bold", color = "grey40", margin = margin(b = 15)),
    axis.text.y = element_blank(),
    panel.border = element_rect(size = 0.6, color = "white", fill = NA),
    panel.spacing.y = unit(1, "lines"),
    panel.spacing.x = unit(3, "lines"),
    legend.position = c(0.85, 1.09),
    legend.title = element_text(size = rel(0.8), face = "bold", color = "grey60"),
    legend.spacing.x = unit(0.5, "mm")
  )
# Render the plot to a 9 x 8 inch PNG at 320 dpi
ragg::agg_png(here::here("plots", "plot_16_plot_top_domains_top_30_results.png"), width = 9, height = 8, units = "in", res = 320)
print(p)
dev.off()
## png
## 2
We define the domain specificity of a keyword as the number of top 10 Google results that feature the dominant domain of this top 10. For instance, a search that returns 3 eBay results, 5 Amazon results, and 2 other results would be dominated by Amazon, so it would be considered “Amazon specific” and would be attributed a domain specificity of 5.
For every search engine we observe that searches that are more domain specific tend to yield results less similar to Google’s.
# Coordinates of a rectangle sitting on top of each bar of the domain
# specificity bar chart: 0.9 wide, starting at the bar height `n` and
# extending 1300 units above it.
figures_background <-
  domain_specificity_by_kw %>%
  count(domain_specificity_grouped) %>%
  mutate(
    # one x position per group, in row order; derived from the data instead of
    # a hard-coded 1:5 so a different number of groups does not raise an error
    x = seq_len(n()),
    xmin = x - 0.45,
    xmax = x + 0.45,
    ymin = n,
    ymax = n + 1300
  )
# Bar chart of the number of keywords per domain specificity group. A darker
# rectangle (from `figures_background`) is drawn on top of each bar and the
# formatted count is printed in white. The plot is then written to a PNG.
p <-
  domain_specificity_by_kw %>%
  count(domain_specificity_grouped) %>%
  ggplot(aes(domain_specificity_grouped, n)) +
  geom_col(fill = "#807FFF") +
  geom_rect(
    data = figures_background,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
    fill = "#5250F3",
    # the layer supplies its own aesthetics; without this it would also inherit
    # the plot-level x/y mapping, and figures_background has its own `x` column
    inherit.aes = FALSE
  ) +
  geom_text(
    aes(label = scales::comma(n)),
    vjust = -0.5, family = "Poppins", size = 3,
    fontface = "bold", color = "white"
  ) +
  labs(
    title = "Breakdown of domain specificity in our sample",
    x = "Domain Specificity",
    y = ""
  ) +
  scale_y_continuous(limits = c(NA, 29000)) +
  # `expand` expects a single logical; FALSE disables the axis padding
  coord_cartesian(clip = "off", expand = FALSE) +
  theme(
    plot.title = element_text(margin = margin(b = 15)),
    panel.grid = element_blank(),
    axis.text.y = element_blank()
  )
# Render the plot to a 6 x 5 inch PNG at 320 dpi
ragg::agg_png(here::here("plots", "plot_17_plot_domain_specificity.png"), width = 6, height = 5, units = "in", res = 320)
print(p)
dev.off()
# Replace the stored engine name "DuckDuck" with "DuckDuckGo"; every other
# value is left as it is.
similarity_by_se_and_spec <- mutate(
  similarity_by_se_and_spec,
  search_engine = case_when(
    search_engine == "DuckDuck" ~ "DuckDuckGo",
    TRUE ~ search_engine
  )
)
# Line chart of each competitor's similarity to Google across the domain
# specificity groups. Engine names are printed next to the points of group
# "1". The plot is then written to a PNG with the ragg device.
p <-
  ggplot(
    data = similarity_by_se_and_spec,
    mapping = aes(
      x = domain_specificity_grouped,
      y = similarity,
      colour = search_engine,
      group = search_engine
    )
  ) +
  geom_line() +
  geom_point(
    mapping = aes(fill = search_engine),
    shape = 21,
    size = 4,
    color = "white",
    stroke = 0.6
  ) +
  ggrepel::geom_text_repel(
    data = filter(similarity_by_se_and_spec, domain_specificity_grouped == "1"),
    mapping = aes(label = search_engine),
    size = 3,
    hjust = 0,
    nudge_y = 0.005,
    fontface = "bold",
    family = "Poppins"
  ) +
  scale_x_discrete(expand = expansion(0.01)) +
  scale_y_continuous(
    limits = c(0.20, 0.40),
    breaks = seq(0.20, 0.40, 0.05),
    labels = scales::label_percent(accuracy = 1),
    expand = expansion(0)
  ) +
  scale_color_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  scale_fill_manual(
    values = c("Bing" = "#f25f5c", "DuckDuckGo" = "#70c1b3", "Yahoo" = "#247ba0")
  ) +
  labs(
    title = "Search Engine Similarity to Google by Domain Specificity",
    subtitle = "Based on Top 10 rankings",
    colour = "Search Engine",
    x = "Domain specificity",
    y = "Search Engine Similarity"
  ) +
  coord_cartesian(clip = "off") +
  theme(
    plot.title = element_text(face = "bold", margin = margin(b = 5)),
    plot.subtitle = element_text(face = "bold", color = "grey40", margin = margin(b = 15))
  )
# Render the plot to a 7 x 4.5 inch PNG at 320 dpi
ragg::agg_png(
  here::here("plots", "plot_18_plot_se_similarity_domain_specificity.png"),
  width = 7, height = 4.5, units = "in", res = 320
)
print(p)
dev.off()
We take a look at the position at which we find the first Google result in the results of the competitors.
The most likely position of the first Google result is 1st. For every search engine, around 35% of keywords will have Google’s first result on top.
For every search engine, in 12% of the cases, the first Google result is found in second position.
The second most likely position is that it’s not found at all in the top 30.
A fair number of Google’s first results are not found at all by the other search engines, at least not in the top 30, so we cannot compute a mean position. We do see, however, that for every other search engine, Google’s top result is found in the top 3 more than half of the time.
# Replace the stored engine name "DuckDuck" with "DuckDuckGo"; every other
# value is left as it is.
google1_positions <- mutate(
  google1_positions,
  search_engine = case_when(
    search_engine == "DuckDuck" ~ "DuckDuckGo",
    TRUE ~ search_engine
  )
)
# Data for the donut charts: for each search engine and each position bucket of
# Google's top result, one row for the share (`pct`) and one row for its
# complement (`pct_remainder`), plus the label, fill key and alpha used by the
# plot.
search_position_df <-
  google1_positions %>%
  group_by(search_engine) %>%
  mutate(
    # collapse the raw position into four buckets (NA means not found)
    google1_position = case_when(
      google1_position == 1 ~ "Position #1",
      google1_position == 2 ~ "Position #2",
      is.na(google1_position) ~ "Not Found",
      TRUE ~ "Between #3 and #30"
    ),
    google1_position = factor(
      google1_position,
      levels = c("Position #1", "Position #2", "Between #3 and #30", "Not Found")
    )
  ) %>%
  group_by(search_engine, google1_position) %>%
  # NOTE(review): max(pct) keeps the largest single share inside the
  # "Between #3 and #30" bucket; if `pct` is a per-position share with one row
  # per position, sum(pct) may be what is intended -- confirm against the
  # structure of google1_positions.
  summarise(pct = max(pct), pct_text = glue::glue("{round(pct*100, 0)}%")) %>%
  # the remaining steps are row-wise; ungroup so that `search_engine` is not
  # modified while it is a grouping variable and the result is a plain tibble
  ungroup() %>%
  mutate(pct_remainder = 1 - pct) %>%
  pivot_longer(cols = c(pct, pct_remainder)) %>%
  mutate(
    # the share is drawn opaque, the remainder faded
    alpha = if_else(name == "pct", 1, 0.15),
    # engine-specific fill key for the share, e.g. "Bing_pct"
    name = if_else(name == "pct", as.character(glue::glue("{search_engine}_{name}")), name),
    # facet strip labels as HTML spans (rendered with element_markdown in the
    # plot); the Bing span is closed with </span> (it was </style>)
    # NOTE(review): only the Yahoo label is bold -- confirm this is intended
    search_engine = factor(
      search_engine,
      levels = c("Yahoo", "Bing", "DuckDuckGo"),
      labels = c(
        "<span style='color:#247ba0'>**Yahoo**</span>",
        "<span style='color:#f25f5c'>Bing</span>",
        "<span style='color:#70c1b3'>DuckDuckGo</span>"
      )
    )
  )
# Grid of donut charts (rows: search engine, columns: position bucket of
# Google's top result). Each donut shows the share and its faded remainder,
# with the percentage printed in the centre. The plot is then written to a PNG.
p <-
  ggplot(data = search_position_df) +
  geom_arc_bar(
    mapping = aes(
      x0 = 0, y0 = 0,
      r0 = 0.7, r = 1,
      amount = value,
      fill = name,
      alpha = alpha
    ),
    stat = "pie",
    color = NA
  ) +
  # percentage label, taken from the share rows only
  geom_text(
    data = filter(search_position_df, name != "pct_remainder"),
    mapping = aes(x = 0, y = 0, label = pct_text, color = name),
    family = "Poppins",
    size = 6,
    fontface = "bold"
  ) +
  facet_grid(search_engine ~ google1_position, switch = "y") +
  scale_alpha_identity() +
  scale_fill_manual(
    values = c(
      "Bing_pct" = "#f25f5c",
      "DuckDuckGo_pct" = "#70c1b3",
      "Yahoo_pct" = "#247ba0",
      "pct_remainder" = "grey70"
    )
  ) +
  scale_color_manual(
    values = c(
      "Bing_pct" = "#f25f5c",
      "DuckDuckGo_pct" = "#70c1b3",
      "Yahoo_pct" = "#247ba0",
      "pct_remainder" = "grey70"
    )
  ) +
  coord_cartesian(clip = "off") +
  labs(
    title = str_wrap("Google’s top result does well on other search engines most of the time", width = 40),
    x = "",
    y = ""
  ) +
  theme(
    plot.title = element_text(size = rel(1.8), hjust = 0.5, margin = margin(b = 15)),
    panel.grid = element_blank(),
    axis.text = element_blank(),
    panel.spacing.y = unit(0.5, "lines"),
    strip.background = element_blank(),
    # row strips contain HTML/markdown, so they need element_markdown
    strip.text.y.left = element_markdown(size = rel(1.5), hjust = 0, angle = 0, margin = margin(r = 15)),
    strip.text.x = element_text(size = rel(1.1), color = "grey40", margin = margin(b = 5))
  )
# Render the plot to a 12 x 7.9 inch PNG at 320 dpi
ragg::agg_png(
  here::here("plots", "plot_19_plot_search_position.png"),
  width = 12, height = 7.9, units = "in", res = 320
)
print(p)
dev.off()
## # A tibble: 4 x 2
## search_engine median
## <chr> <dbl>
## 1 Bing 3
## 2 DuckDuck 3
## 3 Google 1
## 4 Yahoo 3