"""
.. _serp:
Import Search Engine Results Pages (SERPs) for Google and YouTube
=================================================================
Analyzing a single SERP is like getting one person to fill out a questionnaire
and calling it a survey.
Just like surveys, SERPs need to be collected in large-enough numbers that are
representative of the industry/market you want to understand. This is the main
feature of the ``serp_`` functions. They allow you to get the SERPs for a list
of queries, across several dimensions (like country, search type, start
position, and so on).
There are many parameters that can be used, and you can supply a list for each.
The function will get the SERPs for the *product* of all those lists. For
example, let's say you provide the following arguments to the
:func:`serp_goog` function:
* `q`: ['serp tools', 'best serp tools', 'serp tool reviews']
* `gl`: ['us', 'ca', 'uk', 'au', 'nz']
* `start`: [1, 11, 21]
The function will produce:
3 (queries) x 5 (countries) x 3 (start positions) = 45 requests
You typically get ten results each, so in this case you would get 450 rows of
data.
All this is done with one line of code. The result is a single DataFrame
with a row for each result, and columns for each attribute (title, snippet,
etc.), as well as meta data columns, like `queryTime` and the parameters you
selected (`q`, `gl`, and `start` in this case).
Before being able to run queries using :func:`serp_goog`, you will need to set
up some credentials as follows (you don't need a custom search engine for
:func:`serp_youtube`):
* `Create a custom search engine <https://cse.google.com/>`_: At first, you might be
asked to enter a site to search. Enter any domain, then go to the control panel and
remove it. Make sure you enable "Search the entire web" and image search. You will
also need to get your search engine ID, which you can find on the control panel page.
* `Enable the custom search API <https://console.cloud.google.com/apis/library/customsearch.googleapis.com?pli=1>`_:
The service will allow you to retrieve and display search results from your custom
search engine programmatically. You will need to create a project for this first.
* `Create credentials for this project <https://console.developers.google.com/apis/api/customsearch.googleapis.com/credentials>`_:
so you can get your key.
* `Enable billing for your project <https://console.cloud.google.com/billing/projects>`_
if you want to run more than 100 queries per day. The first 100 queries are free; then
for each additional 1,000 queries, you pay $5.
"""
# Public names of this module (what ``from <module> import *`` exports).
# SERP_YTUBE_VALID_VALS and the underscore-prefixed helpers are intentionally
# not listed here.
__all__ = [
"SERP_GOOG_VALID_VALS",
"YOUTUBE_TOPIC_IDS",
"YOUTUBE_VID_CATEGORY_IDS",
"serp_goog",
"serp_youtube",
"set_logging_level",
"youtube_channel_details",
"youtube_video_details",
]
import datetime
import logging
from itertools import product
import pandas as pd
if int(pd.__version__[0]) >= 1:
from pandas import json_normalize
else:
from pandas.io.json import json_normalize
import requests
# Layout of log records: timestamp | level | file:line | function | message.
SERP_GOOG_LOG_FMT = (
"%(asctime)s | %(levelname)s | %(filename)s:%(lineno)d "
"| %(funcName)s | %(message)s"
)
# NOTE: this runs at import time and targets the root logger; per the logging
# docs, basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(format=SERP_GOOG_LOG_FMT)
##############################################################################
# Google variables
##############################################################################
# Accepted values for the restricted ``serp_goog`` parameters, keyed by
# parameter name. ``serp_goog`` raises ``ValueError`` when a supplied value is
# not contained in the matching collection.
SERP_GOOG_VALID_VALS = {
    # File extensions accepted by ``fileType``.
    "fileType": {
        "bas", "c", "cc", "cpp", "cs", "cxx", "doc", "docx", "dwf", "gpx",
        "h", "hpp", "htm", "html", "hwp", "java", "kml", "kmz", "odp", "ods",
        "odt", "pdf", "pl", "ppt", "pptx", "ps", "py", "rtf", "svg", "swf",
        "tex", "text", "txt", "wap", "wml", "xls", "xlsx", "xml",
    },
    "c2coff": {0, 1},
    # Country-restrict codes accepted by ``cr``.
    "cr": {
        "countryAF", "countryAL", "countryDZ", "countryAS", "countryAD", "countryAO",
        "countryAI", "countryAQ", "countryAG", "countryAR", "countryAM", "countryAW",
        "countryAU", "countryAT", "countryAZ",
        "countryBS", "countryBH", "countryBD", "countryBB", "countryBY", "countryBE",
        "countryBZ", "countryBJ", "countryBM", "countryBT", "countryBO", "countryBA",
        "countryBW", "countryBV", "countryBR", "countryIO", "countryBN", "countryBG",
        "countryBF", "countryBI",
        "countryKH", "countryCM", "countryCA", "countryCV", "countryKY", "countryCF",
        "countryTD", "countryCL", "countryCN", "countryCX", "countryCC", "countryCO",
        "countryKM", "countryCG", "countryCD", "countryCK", "countryCR", "countryCI",
        "countryHR", "countryCU", "countryCY", "countryCZ",
        "countryDK", "countryDJ", "countryDM", "countryDO",
        "countryTP", "countryEC", "countryEG", "countrySV", "countryGQ", "countryER",
        "countryEE", "countryET", "countryEU",
        "countryFK", "countryFO", "countryFJ", "countryFI", "countryFR", "countryFX",
        "countryGF", "countryPF", "countryTF",
        "countryGA", "countryGM", "countryGE", "countryDE", "countryGH", "countryGI",
        "countryGR", "countryGL", "countryGD", "countryGP", "countryGU", "countryGT",
        "countryGN", "countryGW", "countryGY",
        "countryHT", "countryHM", "countryVA", "countryHN", "countryHK", "countryHU",
        "countryIS", "countryIN", "countryID", "countryIR", "countryIQ", "countryIE",
        "countryIL", "countryIT",
        "countryJM", "countryJP", "countryJO",
        "countryKZ", "countryKE", "countryKI", "countryKP", "countryKR", "countryKW",
        "countryKG",
        "countryLA", "countryLV", "countryLB", "countryLS", "countryLR", "countryLY",
        "countryLI", "countryLT", "countryLU",
        "countryMO", "countryMK", "countryMG", "countryMW", "countryMY", "countryMV",
        "countryML", "countryMT", "countryMH", "countryMQ", "countryMR", "countryMU",
        "countryYT", "countryMX", "countryFM", "countryMD", "countryMC", "countryMN",
        "countryMS", "countryMA", "countryMZ", "countryMM",
        "countryNA", "countryNR", "countryNP", "countryNL", "countryAN", "countryNC",
        "countryNZ", "countryNI", "countryNE", "countryNG", "countryNU", "countryNF",
        "countryMP", "countryNO",
        "countryOM",
        "countryPK", "countryPW", "countryPS", "countryPA", "countryPG", "countryPY",
        "countryPE", "countryPH", "countryPN", "countryPL", "countryPT", "countryPR",
        "countryQA",
        "countryRE", "countryRO", "countryRU", "countryRW",
        "countrySH", "countryKN", "countryLC", "countryPM", "countryVC", "countryWS",
        "countrySM", "countryST", "countrySA", "countrySN", "countryCS", "countrySC",
        "countrySL", "countrySG", "countrySK", "countrySI", "countrySB", "countrySO",
        "countryZA", "countryGS", "countryES", "countryLK", "countrySD", "countrySR",
        "countrySJ", "countrySZ", "countrySE", "countryCH", "countrySY",
        "countryTW", "countryTJ", "countryTZ", "countryTH", "countryTG", "countryTK",
        "countryTO", "countryTT", "countryTN", "countryTR", "countryTM", "countryTC",
        "countryTV",
        "countryUG", "countryUA", "countryAE", "countryUK", "countryUS", "countryUM",
        "countryUY", "countryUZ",
        "countryVU", "countryVE", "countryVN", "countryVG", "countryVI",
        "countryWF", "countryEH",
        "countryYE", "countryYU",
        "countryZM", "countryZW",
    },
    # Two-letter country codes accepted by ``gl``.
    "gl": {
        "ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "as",
        "at", "au", "aw", "az",
        "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo",
        "br", "bs", "bt", "bv", "bw", "by", "bz",
        "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co",
        "cr", "cs", "cu", "cv", "cx", "cy", "cz",
        "de", "dj", "dk", "dm", "do", "dz",
        "ec", "ee", "eg", "eh", "er", "es", "et",
        "fi", "fj", "fk", "fm", "fo", "fr",
        "ga", "gd", "ge", "gf", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr",
        "gs", "gt", "gu", "gw", "gy",
        "hk", "hm", "hn", "hr", "ht", "hu",
        "id", "ie", "il", "in", "io", "iq", "ir", "is", "it",
        "jm", "jo", "jp",
        "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz",
        "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly",
        "ma", "mc", "md", "mg", "mh", "mk", "ml", "mm", "mn", "mo", "mp", "mq",
        "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", "mz",
        "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz",
        "om",
        "pa", "pe", "pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "ps", "pt",
        "pw", "py",
        "qa",
        "re", "ro", "ru", "rw",
        "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "sk", "sl", "sm",
        "sn", "so", "sr", "st", "sv", "sy", "sz",
        "tc", "td", "tf", "tg", "th", "tj", "tk", "tl", "tm", "tn", "to", "tr",
        "tt", "tv", "tw", "tz",
        "ua", "ug", "uk", "um", "us", "uy", "uz",
        "va", "vc", "ve", "vg", "vi", "vn", "vu",
        "wf", "ws",
        "ye", "yt",
        "za", "zm", "zw",
    },
    "filter": {0, 1},
    # Interface-language codes accepted by ``hl``.
    "hl": {
        "af", "sq", "sm", "ar", "az", "eu", "be", "bn", "bh", "bs",
        "bg", "ca", "zh-CN", "zh-TW", "hr", "cs", "da", "nl", "en", "eo",
        "et", "fo", "fi", "fr", "fy", "gl", "ka", "de", "el", "gu",
        "iw", "hi", "hu", "is", "id", "ia", "ga", "it", "ja", "jw",
        "kn", "ko", "la", "lv", "lt", "mk", "ms", "ml", "mt", "mr",
        "ne", "no", "nn", "oc", "fa", "pl", "pt-BR", "pt-PT", "pa", "ro",
        "ru", "gd", "sr", "si", "sk", "sl", "es", "su", "sw", "sv",
        "tl", "ta", "te", "th", "ti", "tr", "uk", "ur", "uz", "vi",
        "cy", "xh", "zu",
    },
    "imgColorType": {"color", "gray", "mono", "trans"},
    "imgDominantColor": {
        "black", "blue", "brown", "gray", "green", "orange",
        "pink", "purple", "red", "teal", "white", "yellow",
    },
    "imgSize": {"huge", "icon", "large", "medium", "small", "xlarge", "xxlarge"},
    "imgType": {"clipart", "face", "lineart", "stock", "photo", "animated"},
    # Document-language restrictions accepted by ``lr``.
    "lr": {
        "lang_ar", "lang_bg", "lang_ca", "lang_zh-CN", "lang_zh-TW", "lang_hr",
        "lang_cs", "lang_da", "lang_nl", "lang_en", "lang_et", "lang_fi",
        "lang_fr", "lang_de", "lang_el", "lang_iw", "lang_hu", "lang_is",
        "lang_id", "lang_it", "lang_ja", "lang_ko", "lang_lv", "lang_lt",
        "lang_no", "lang_pl", "lang_pt", "lang_ro", "lang_ru", "lang_sr",
        "lang_sk", "lang_sl", "lang_es", "lang_sv", "lang_tr",
    },
    "num": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
    "rights": {
        "cc_publicdomain",
        "cc_attribute",
        "cc_sharealike",
        "cc_noncommercial",
        "cc_nonderived",
    },
    "safe": {"active", "off"},
    "searchType": {None, "image"},
    "siteSearchFilter": {"e", "i"},
    # 1 through 91 inclusive.
    "start": range(1, 92),
}
##############################################################################
# YouTube variables
##############################################################################
# Topic IDs grouped by topic family: {family name: {topic name: topic ID}}.
YOUTUBE_TOPIC_IDS = {
    "Entertainment topics": {
        "Entertainment (parent topic)": "/m/02jjt", "Humor": "/m/09kqc",
        "Movies": "/m/02vxn", "Performing arts": "/m/05qjc",
        "Professional wrestling": "/m/066wd", "TV shows": "/m/0f2f9",
    },
    "Gaming topics": {
        "Action game": "/m/025zzc", "Action-adventure game": "/m/02ntfj",
        "Casual game": "/m/0b1vjn", "Gaming (parent topic)": "/m/0bzvm2",
        "Music video game": "/m/02hygl", "Puzzle video game": "/m/04q1x3q",
        "Racing video game": "/m/01sjng", "Role-playing video game": "/m/0403l3g",
        "Simulation video game": "/m/021bp2", "Sports game": "/m/022dc6",
        "Strategy video game": "/m/03hf_rm",
    },
    "Lifestyle topics": {
        "Fashion": "/m/032tl", "Fitness": "/m/027x7n",
        "Food": "/m/02wbm", "Hobby": "/m/03glg",
        "Lifestyle (parent topic)": "/m/019_rr", "Pets": "/m/068hy",
        "Physical attractiveness [Beauty]": "/m/041xxh", "Technology": "/m/07c1v",
        "Tourism": "/m/07bxq", "Vehicles": "/m/07yv9",
    },
    "Music topics": {
        "Christian music": "/m/02mscn", "Classical music": "/m/0ggq0m",
        "Country": "/m/01lyv", "Electronic music": "/m/02lkt",
        "Hip hop music": "/m/0glt670", "Independent music": "/m/05rwpb",
        "Jazz": "/m/03_d0", "Music (parent topic)": "/m/04rlf",
        "Music of Asia": "/m/028sqc", "Music of Latin America": "/m/0g293",
        "Pop music": "/m/064t9", "Reggae": "/m/06cqb",
        "Rhythm and blues": "/m/06j6l", "Rock music": "/m/06by7",
        "Soul music": "/m/0gywn",
    },
    "Other topics": {"Knowledge": "/m/01k8wb"},
    "Society topics": {
        "Business": "/m/09s1f", "Health": "/m/0kt51",
        "Military": "/m/01h6rj", "Politics": "/m/05qt0",
        "Religion": "/m/06bvp", "Society (parent topic)": "/m/098wr",
    },
    "Sports topics": {
        "American football": "/m/0jm_", "Baseball": "/m/018jz",
        "Basketball": "/m/018w8", "Boxing": "/m/01cgz",
        "Cricket": "/m/09xp_", "Football": "/m/02vx4",
        "Golf": "/m/037hz", "Ice hockey": "/m/03tmr",
        "Mixed martial arts": "/m/01h7lh", "Motorsport": "/m/0410tth",
        "Sports (parent topic)": "/m/06ntj", "Tennis": "/m/07bs0",
        "Volleyball": "/m/07_53",
    },
}
# Video category name -> category ID (IDs are kept as strings).
YOUTUBE_VID_CATEGORY_IDS = {
    "Action/Adventure": "32", "Anime/Animation": "31",
    "Autos & Vehicles": "2", "Classics": "33",
    "Comedy": "34", "Documentary": "35",
    "Drama": "36", "Education": "27",
    "Entertainment": "24", "Family": "37",
    "Film & Animation": "1", "Foreign": "38",
    "Gaming": "20", "Horror": "39",
    "Howto & Style": "26", "Movies": "30",
    "Music": "10", "News & Politics": "25",
    "Nonprofits & Activism": "29", "People & Blogs": "22",
    "Pets & Animals": "15", "Sci-Fi/Fantasy": "40",
    "Science & Technology": "28", "Short Movies": "18",
    "Shorts": "42", "Shows": "43",
    "Sports": "17", "Thriller": "41",
    "Trailers": "44", "Travel & Events": "19",
    "Videoblogging": "21",
}
# Accepted values for restricted YouTube search parameters, keyed by parameter
# name. NOTE(review): presumably consumed by ``serp_youtube`` for validation;
# confirm against that function's body.
SERP_YTUBE_VALID_VALS = {
    "channelType": {"any", "show"},
    "eventType": {"completed", "live", "upcoming"},
    "forContentOwner": {True, False, "true", "false"},
    "forDeveloper": {True, False, "true", "false"},
    "forMine": {True, False, "true", "false"},
    # 0 through 50 inclusive.
    "maxResults": range(51),
    "order": {"date", "rating", "relevance", "title", "videoCount", "viewCount"},
    "regionCode": {
        "ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "as",
        "at", "au", "aw", "az",
        "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo",
        "br", "bs", "bt", "bv", "bw", "by", "bz",
        "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co",
        "cr", "cs", "cu", "cv", "cx", "cy", "cz",
        "de", "dj", "dk", "dm", "do", "dz",
        "ec", "ee", "eg", "eh", "er", "es", "et",
        "fi", "fj", "fk", "fm", "fo", "fr",
        "ga", "gd", "ge", "gf", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr",
        "gs", "gt", "gu", "gw", "gy",
        "hk", "hm", "hn", "hr", "ht", "hu",
        "id", "ie", "il", "in", "io", "iq", "ir", "is", "it",
        "jm", "jo", "jp",
        "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz",
        "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly",
        "ma", "mc", "md", "mg", "mh", "mk", "ml", "mm", "mn", "mo", "mp", "mq",
        "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", "mz",
        "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz",
        "om",
        "pa", "pe", "pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "ps", "pt",
        "pw", "py",
        "qa",
        "re", "ro", "ru", "rw",
        "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "sk", "sl", "sm",
        "sn", "so", "sr", "st", "sv", "sy", "sz",
        "tc", "td", "tf", "tg", "th", "tj", "tk", "tl", "tm", "tn", "to", "tr",
        "tt", "tv", "tw", "tz",
        "ua", "ug", "uk", "um", "us", "uy", "uz",
        "va", "vc", "ve", "vg", "vi", "vn", "vu",
        "wf", "ws",
        "ye", "yt",
        "za", "zm", "zw",
    },
    "relevanceLanguage": {
        "af", "sq", "sm", "ar", "az", "eu", "be", "bn", "bh", "bs",
        "bg", "ca", "zh-CN", "zh-TW", "zh-Hans", "zh-Hant",
        "hr", "cs", "da", "nl", "en", "eo",
        "et", "fo", "fi", "fr", "fy", "gl", "ka", "de", "el", "gu",
        "iw", "hi", "hu", "is", "id", "ia", "ga", "it", "ja", "jw",
        "kn", "ko", "la", "lv", "lt", "mk", "ms", "ml", "mt", "mr",
        "ne", "no", "nn", "oc", "fa", "pl", "pt-BR", "pt-PT", "pa", "ro",
        "ru", "gd", "sr", "si", "sk", "sl", "es", "su", "sw", "sv",
        "tl", "ta", "te", "th", "ti", "tr", "uk", "ur", "uz", "vi",
        "cy", "xh", "zu",
    },
    "safeSearch": {"moderate", "none", "strict"},
    "topicId": {
        "/m/04rlf", "/m/02mscn", "/m/0ggq0m", "/m/01lyv", "/m/02lkt",
        "/m/0glt670", "/m/05rwpb", "/m/03_d0", "/m/028sqc", "/m/0g293",
        "/m/064t9", "/m/06cqb", "/m/06j6l", "/m/06by7", "/m/0gywn",
        "/m/0bzvm2", "/m/025zzc", "/m/02ntfj", "/m/0b1vjn", "/m/02hygl",
        "/m/04q1x3q", "/m/01sjng", "/m/0403l3g", "/m/021bp2", "/m/022dc6",
        "/m/03hf_rm",
        "/m/06ntj", "/m/0jm_", "/m/018jz", "/m/018w8", "/m/01cgz",
        "/m/09xp_", "/m/02vx4", "/m/037hz", "/m/03tmr", "/m/01h7lh",
        "/m/0410tth", "/m/07bs0", "/m/07_53",
        "/m/02jjt", "/m/09kqc", "/m/02vxn", "/m/05qjc", "/m/066wd", "/m/0f2f9",
        "/m/019_rr", "/m/032tl", "/m/027x7n", "/m/02wbm", "/m/03glg",
        "/m/068hy", "/m/041xxh", "/m/07c1v", "/m/07bxq", "/m/07yv9",
        "/m/098wr", "/m/09s1f", "/m/0kt51", "/m/01h6rj", "/m/05qt0", "/m/06bvp",
        "/m/01k8wb",
    },
    "type": {"channel", "playlist", "video"},
    "videoCaption": {"any", "closedCaption", "none"},
    "videoCategoryId": {
        "1", "2", "10", "15", "17", "18", "19", "20", "21", "22", "23",
        "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34",
        "35", "36", "37", "38", "39", "40", "41", "42", "43", "44",
    },
    "videoDefinition": {"any", "high", "standard"},
    "videoDimension": {"2d", "3d", "any"},
    "videoDuration": {"any", "long", "medium", "short"},
    "videoEmbeddable": {"any", True, "true"},
    "videoLicense": {"any", "creativeCommon", "youtube"},
    "videoSyndicated": {"any", True, "true"},
    "videoType": {"any", "episode", "movie"},
}
def _split_by_comma(s, length=50):
"""Group a comma-separated string into a list of at-most
``length``-length words each."""
str_split = s.split(",")
str_list = []
for i in range(0, len(str_split) + length, length):
temp_str = ",".join(str_split[i : i + length])
if temp_str:
str_list.append(temp_str)
return str_list
[docs]
def youtube_video_details(key, vid_ids):
    """Return details of videos for which the ids are given.

    Assumes ``vid_ids`` is a comma-separated list of video ids with
    no spaces. The ids are requested in batches of at most 50 per call.

    Parameters
    ----------
    key : str
        Your Google Developer key.
    vid_ids : str
        A comma-separated list of video ID's, with no spaces.

    Returns
    -------
    video_df : pandas.DataFrame
        One row per returned video: the raw top-level fields of each item,
        followed by the expanded fields of the "snippet", "topicDetails",
        "statistics", "status" and "contentDetails" parts. Empty when no
        ids are supplied.

    Raises
    ------
    Exception
        If the API answers with a status code of 400 or above; the decoded
        error body is passed as the exception argument.
    """
    base_url = (
        "https://www.googleapis.com/youtube/v3/videos?part="
        "contentDetails,id,liveStreamingDetails,localizations,player,"
        "recordingDetails,snippet,statistics,status,topicDetails"
    )
    details = ["snippet", "topicDetails", "statistics", "status", "contentDetails"]
    batch_frames = []
    for id_batch in _split_by_comma(vid_ids, length=50):
        logging.info(msg="Requesting: " + "video details")
        video_resp = requests.get(base_url, params={"id": id_batch, "key": key})
        if video_resp.status_code >= 400:
            raise Exception(video_resp.json())
        # Decode the body once instead of once per detail.
        items = video_resp.json()["items"]
        frames = [pd.DataFrame(items)]
        for detail in details:
            # A part may be absent from some items only; fill those rows with
            # missing values rather than discarding the part for the whole
            # batch. Parts absent from every item add no columns.
            if any(detail in item for item in items):
                frames.append(
                    pd.DataFrame([item.get(detail, {}) for item in items])
                )
        batch_frames.append(pd.concat(frames, axis=1))
    if not batch_frames:
        return pd.DataFrame()
    return pd.concat(batch_frames, sort=False, ignore_index=True)
[docs]
def youtube_channel_details(key, channel_ids):
    """Return details of channels for which the ids are given.

    Assumes ``channel_ids`` is a comma-separated list of channel ids with
    no spaces. The ids are requested in batches of at most 50 per call.

    Parameters
    ----------
    key : str
        Your Google Developer key.
    channel_ids : str
        A comma-separated list of channel ID's, with no spaces.

    Returns
    -------
    channel_df : pandas.DataFrame
        One row per returned channel: the raw top-level fields of each item,
        followed by the expanded fields of the "snippet", "statistics" and
        "contentDetails" parts. Empty when no ids are supplied.

    Raises
    ------
    Exception
        If the API answers with a status code of 400 or above; the decoded
        error body is passed as the exception argument.
    """
    base_url = (
        "https://www.googleapis.com/youtube/v3/channels?part="
        "snippet,contentDetails,statistics"
    )
    details = ["snippet", "statistics", "contentDetails"]
    batch_frames = []
    for id_batch in _split_by_comma(channel_ids, length=50):
        logging.info(msg="Requesting: " + "channel details")
        channel_resp = requests.get(base_url, params={"id": id_batch, "key": key})
        if channel_resp.status_code >= 400:
            raise Exception(channel_resp.json())
        # Decode the body once instead of once per detail.
        items = channel_resp.json()["items"]
        frames = [pd.DataFrame(items)]
        for detail in details:
            # A part may be absent from some items only; fill those rows with
            # missing values rather than discarding the part for the whole
            # batch. Parts absent from every item add no columns.
            if any(detail in item for item in items):
                frames.append(
                    pd.DataFrame([item.get(detail, {}) for item in items])
                )
        batch_frames.append(pd.concat(frames, axis=1))
    if not batch_frames:
        return pd.DataFrame()
    return pd.concat(batch_frames, sort=False, ignore_index=True)
def _dict_product(d):
"""Return the product of all values of a dict, while
coupling each value with its key.
This is used to generate multiple queries out of
possibly multiple arguments in serp_goog.
>>> d = {"a": [1], "b": [2, 3, 4], "c": [5, 6]}
>>> _dict_product(d)
>>> [{'a': 1, 'b': 2, 'c': 5},
{'a': 1, 'b': 2, 'c': 6},
{'a': 1, 'b': 3, 'c': 5},
{'a': 1, 'b': 3, 'c': 6},
{'a': 1, 'b': 4, 'c': 5},
{'a': 1, 'b': 4, 'c': 6}]
"""
items = list(d.items())
keys = [x[0] for x in items]
values = [x[1] for x in items]
dicts = []
for prod in product(*values):
tempdict = dict(zip(keys, prod))
dicts.append(tempdict)
return dicts
[docs]
def _expand_nested_column(values, taken_columns, prefix):
    """Flatten a column of nested JSON values into a DataFrame of columns.

    Each entry of ``values`` is passed to ``json_normalize``; entries that
    cannot be normalized contribute a single placeholder row so the result
    stays positionally aligned with ``values``. Resulting columns whose name
    already exists in ``taken_columns`` are renamed to ``prefix + name``.
    """
    frames = []
    for value in values:
        try:
            frames.append(json_normalize(value))
        except Exception:
            # Deliberate best-effort: presumably missing values (rows without
            # this field). A one-row placeholder keeps row positions in step.
            frames.append(pd.DataFrame({"delete_me": None}, index=range(1)))
    if not frames:
        return pd.DataFrame()
    # NOTE(review): alignment relies on every entry normalizing to exactly one
    # row; a list holding several objects would yield several rows.
    expanded = pd.concat(frames, sort=False).reset_index(drop=True)
    if "delete_me" in expanded:
        del expanded["delete_me"]
    clashes = {col: prefix + col for col in expanded if col in taken_columns}
    return expanded.rename(columns=clashes)


def serp_goog(
    q,
    cx,
    key,
    c2coff=None,
    cr=None,
    dateRestrict=None,
    exactTerms=None,
    excludeTerms=None,
    fileType=None,
    filter=None,
    gl=None,
    highRange=None,
    hl=None,
    hq=None,
    imgColorType=None,
    imgDominantColor=None,
    imgSize=None,
    imgType=None,
    linkSite=None,
    lowRange=None,
    lr=None,
    num=None,
    orTerms=None,
    rights=None,
    safe=None,
    searchType=None,
    siteSearch=None,
    siteSearchFilter=None,
    sort=None,
    start=None,
):
    """Query Google's search API and get search results in a DataFrame.

    For each parameter, you can supply single or multiple values / arguments
    (a single ``str``/``int``, or a list of them). If you pass multiple
    arguments, all the possible combinations of arguments (the product) will
    be requested, and you will get one DataFrame combining all queries. See
    examples below.

    Parameters
    ----------
    q : str
        The search expression.
    cx : str
        The custom search engine ID to use for this request.
    key : str
        The API key of your custom search engine.
    c2coff : int
        Enables or disables Simplified and Traditional Chinese Search. The
        default value for this parameter is 0 (zero), meaning that the feature
        is enabled. Supported values are: 1: Disabled, 0: Enabled (default).
    cr : str
        Restricts search results to documents originating in a particular
        country. You may use Boolean operators in the cr parameter's value.
        Google Search determines the country of a document by analyzing: the
        top-level domain (TLD) of the document's URL; the geographic location
        of the Web server's IP address. See the Country Parameter Values page
        for a list of valid values for this parameter.
    dateRestrict : str
        Restricts results to URLs based on date.
        Supported values include:

        - d[number]: requests results from the specified number of past days.
        - w[number]: requests results from the specified number of past weeks.
        - m[number]: requests results from the specified number of past months.
        - y[number]: requests results from the specified number of past years.
    exactTerms : str
        Identifies a phrase that all documents in the search results must
        contain.
    excludeTerms : str
        Identifies a word or phrase that should not appear in any documents in
        the search results.
    fileType : str
        Restricts results to files of a specified extension. A list of file
        types indexable by Google can be found in Search Console Help Center.
    filter : int
        Controls turning on or off the duplicate content filter. See Automatic
        Filtering for more information about Google's search results filters.
        Note that host crowding filtering applies only to multi-site searches.
        By default, Google applies filtering to all search results to improve
        the quality of those results. Acceptable values are: 0: Turns off
        duplicate content filter. 1: Turns on duplicate content filter.
    gl : str
        Geolocation of end user. The gl parameter value is a two-letter
        country code. The gl parameter boosts search results whose country of
        origin matches the parameter value. See the Country Codes page for a
        list of valid values. Specifying a gl parameter value should lead to
        more relevant results. This is particularly true for international
        customers and, even more specifically, for customers in
        English-speaking countries other than the United States.
    highRange : str
        Specifies the ending value for a search range. Use lowRange and
        highRange to append an inclusive search range of lowRange...highRange
        to the query.
    hl : str
        Sets the user interface language. Explicitly setting this parameter
        improves the performance and the quality of your search results. See
        the Interface Languages section of Internationalizing Queries and
        Results Presentation for more information, and Supported Interface
        Languages for a list of supported languages.
    hq : str
        Appends the specified query terms to the query, as if they were
        combined with a logical AND operator.
    imgColorType : str
        Returns black and white, grayscale, transparent, or color images.
        Acceptable values are: "color", "gray", "mono", "trans".
    imgDominantColor : str
        Returns images of a specific dominant color. Acceptable values are:
        "black", "blue", "brown", "gray", "green", "orange", "pink", "purple",
        "red", "teal", "white", "yellow".
    imgSize : str
        Returns images of a specified size. Acceptable values are: "huge",
        "icon", "large", "medium", "small", "xlarge", "xxlarge".
    imgType : str
        Returns images of a type. Acceptable values are: "clipart", "face",
        "lineart", "stock", "photo", "animated".
    linkSite : str
        Specifies that all search results should contain a link to a
        particular URL.
    lowRange : str
        Specifies the starting value for a search range. Use lowRange and
        highRange to append an inclusive search range of lowRange...highRange
        to the query.
    lr : str
        Restricts the search to documents written in a particular language
        (e.g., lr=lang_ja). Acceptable values are: "lang_ar": Arabic
        "lang_bg": Bulgarian "lang_ca": Catalan "lang_cs": Czech
        "lang_da": Danish "lang_de": German "lang_el": Greek
        "lang_en": English "lang_es": Spanish "lang_et": Estonian
        "lang_fi": Finnish "lang_fr": French "lang_hr": Croatian
        "lang_hu": Hungarian "lang_id": Indonesian "lang_is": Icelandic
        "lang_it": Italian "lang_iw": Hebrew "lang_ja": Japanese
        "lang_ko": Korean "lang_lt": Lithuanian "lang_lv": Latvian
        "lang_nl": Dutch "lang_no": Norwegian "lang_pl": Polish
        "lang_pt": Portuguese "lang_ro": Romanian "lang_ru": Russian
        "lang_sk": Slovak "lang_sl": Slovenian "lang_sr": Serbian
        "lang_sv": Swedish "lang_tr": Turkish
        "lang_zh-CN": Chinese (Simplified) "lang_zh-TW": Chinese (Traditional)
    num : int
        Number of search results to return. Valid values are integers between
        1 and 10, inclusive.
    orTerms : str
        Provides additional search terms to check for in a document, where
        each document in the search results must contain at least one of the
        additional search terms.
    rights : str
        Filters based on licensing. Supported values include: cc_publicdomain,
        cc_attribute, cc_sharealike, cc_noncommercial, cc_nonderived, and
        combinations of these.
    safe : str
        Search safety level. Acceptable values are: "active": Enables
        SafeSearch filtering. "off": Disables SafeSearch filtering. (default)
    searchType : str
        Specifies the search type: image. If unspecified, results are limited
        to webpages. Acceptable values are: "image": custom image search.
    siteSearch : str
        Specifies all search results should be pages from a given site.
    siteSearchFilter : str
        Controls whether to include or exclude results from the site named in
        the siteSearch parameter. Acceptable values are: "e": exclude
        "i": include
    sort : str
        The sort expression to apply to the results.
    start : int
        The index of the first result to return. Valid values are integers
        starting at 1 (default); the second result is 2 and so forth (values
        from 1 to 91 are accepted here). For example &start=11 gives the
        second page of results with the default "num" value of 10 results per
        page. Note: No more than 100 results will ever be returned for any
        query with JSON API, even if more than 100 documents match the query,
        so setting (start + num) to more than 100 will produce an error. Note
        that the maximum value for num is 10.

    Returns
    -------
    serp_df : pandas.DataFrame
        One row per search result (or a single mostly-empty row for a query
        with zero results). The supplied optional parameters come first, then
        ``searchTerms``, ``rank``, ``title``, ``snippet``, ``displayLink``,
        ``link``, ``queryTime`` and ``totalResults``, then all remaining
        columns. Empty if the supplied arguments generate no request.

    Raises
    ------
    ValueError
        If a value is supplied that is not listed in ``SERP_GOOG_VALID_VALS``
        for that parameter.
    Exception
        If the API answers with a status code of 400 or above; the decoded
        error body is passed as the exception argument.

    Examples
    --------
    The following function call will produce two queries:
    "hotel" in the USA, and "hotel" in France

    >>> serp_goog(q="hotel", gl=["us", "fr"], cx="YOUR_CX", key="YOUR_KEY")  # doctest: +SKIP

    The below function call will produce four queries and make four requests:

    * "flights" in UK
    * "flights" in Australia
    * "tickets" in UK
    * "tickets" in Australia

    'cr' here refers to 'country restrict', which focuses on content
    originating from the specified country.

    >>> serp_goog(q=['flights', 'tickets'], cr=['countryUK', 'countryAU'],
    ...           cx='YOUR_CX', key='YOUR_KEY')  # doctest: +SKIP
    """
    # Must stay the first statement: captures exactly the call arguments.
    params = locals()
    supplied_params = {k: v for k, v in params.items() if v is not None}
    # Wrap scalars so that every parameter is a collection of candidates.
    supplied_params = {
        name: [value] if isinstance(value, (str, int)) else value
        for name, value in supplied_params.items()
    }
    for name, values in supplied_params.items():
        valid = SERP_GOOG_VALID_VALS.get(name)
        if valid is not None and not set(values).issubset(valid):
            raise ValueError(
                "Please make sure you provide a"
                ' valid value for "{}", valid values:\n'
                "{}".format(
                    name,
                    # The key keeps None (allowed for searchType) from being
                    # compared with strings, which would raise TypeError.
                    sorted(valid, key=lambda v: (v is not None, v)),
                )
            )
    params_list = _dict_product(supplied_params)
    base_url = "https://www.googleapis.com/customsearch/v1?"
    specified_cols = [
        "searchTerms",
        "rank",
        "title",
        "snippet",
        "displayLink",
        "link",
        "queryTime",
        "totalResults",
    ]
    # Leading columns: the optional parameters the caller supplied, in
    # signature order (deterministic, unlike iterating a set).
    param_cols = [
        name for name in supplied_params if name not in {"q", "key", "cx"}
    ]
    if not params_list:
        # An empty list for any parameter yields no combinations to request.
        return pd.DataFrame(columns=param_cols + specified_cols)

    # All requests are issued before any parsing, so an API error aborts the
    # call before partial results are assembled.
    responses = []
    for param in params_list:
        param_log = ", ".join([k + "=" + str(v) for k, v in param.items()])
        logging.info(msg="Requesting: " + param_log)
        resp = requests.get(base_url, params=param)
        if resp.status_code >= 400:
            raise Exception(resp.json())
        responses.append(resp)

    frames = []
    for param, resp in zip(params_list, responses):
        payload = resp.json()
        request_metadata = {
            k: v
            for k, v in payload["queries"]["request"][0].items()
            if k != "title"
        }
        search_info = payload["searchInformation"]
        if int(search_info["totalResults"]) == 0:
            # Keep one placeholder row so the query still shows up in the
            # combined result.
            df = pd.DataFrame(columns=specified_cols, index=range(1))
            df["searchTerms"] = request_metadata["searchTerms"]
        else:
            df = pd.DataFrame(payload["items"])
            df["cseName"] = payload["context"]["title"]
            start_idx = request_metadata["startIndex"]
            df["rank"] = range(start_idx, start_idx + len(df))
        # These keys don't appear in the response so they have to be
        # added manually
        for missing in ["lr", "num", "start", "c2coff"]:
            if missing in param:
                df[missing] = param[missing]
        meta_columns = {**request_metadata, **search_info}
        df = df.assign(**meta_columns)
        df["queryTime"] = datetime.datetime.now(tz=datetime.timezone.utc)
        df["queryTime"] = pd.to_datetime(df["queryTime"])
        if "image" in df:
            img_df = json_normalize(df["image"])
            img_df.columns = ["image." + c for c in img_df.columns]
            df = pd.concat([df, img_df], axis=1)
        frames.append(df)
    result_df = pd.concat(frames, sort=False, ignore_index=True)

    ordered_cols = param_cols + specified_cols
    non_ordered = result_df.columns.difference(set(ordered_cols))
    final_df = result_df[ordered_cols + list(non_ordered)]

    if "pagemap" in final_df:
        pagemap_df = _expand_nested_column(
            final_df["pagemap"], final_df.columns, "pagemap_"
        )
        final_df = pd.concat([final_df, pagemap_df], axis=1)
        if "metatags" in pagemap_df:
            metatag_df = _expand_nested_column(
                pagemap_df["metatags"], final_df.columns, "metatag_"
            )
            final_df = pd.concat([final_df, metatag_df], axis=1)
    return final_df
[docs]
def serp_youtube(
    key,
    q=None,
    channelId=None,
    channelType=None,
    eventType=None,
    forContentOwner=None,
    forDeveloper=None,
    forMine=None,
    location=None,
    locationRadius=None,
    maxResults=None,
    onBehalfOfContentOwner=None,
    order=None,
    pageToken=None,
    publishedAfter=None,
    publishedBefore=None,
    regionCode=None,
    relatedToVideoId=None,
    relevanceLanguage=None,
    safeSearch=None,
    topicId=None,
    type=None,
    videoCaption=None,
    videoCategoryId=None,
    videoDefinition=None,
    videoDimension=None,
    videoDuration=None,
    videoEmbeddable=None,
    videoLicense=None,
    videoSyndicated=None,
    videoType=None,
):
    """Query the YouTube API and get search results in a DataFrame.

    For each parameter you can supply a single or multiple value(s).
    Looping and merging results is handled automatically in case of multiple
    values: one request is sent for every combination of the supplied values.

    Parameters
    ----------
    key : str
      The API key. It is sent with every search request as the ``key`` query
      parameter and is also passed on to the video and channel details look-ups.
    q : str
      The ``q`` parameter specifies the query term to search for. Your request can also
      use the Boolean NOT (-) and OR (|) operators to exclude videos or to find videos
      that are associated with one of several search terms. For example, to search for
      videos matching either "boating" or "sailing", set the ``q`` parameter value to
      boating|sailing. Similarly, to search for videos matching either "boating" or
      "sailing" but not "fishing", set the q parameter value to
      boating|sailing -fishing. Note that the pipe character must be URL-escaped when
      it is sent in your API request. The URL-escaped value for the pipe character is
      %7C.
    channelId : str
      The ``channelId`` parameter indicates that the API response should only contain
      resources created by the channel. Note: Search results are constrained to a
      maximum of 500 videos if your request specifies a value for the ``channelId``
      parameter and sets the ``type`` parameter value to video, but it does not also set
      one of the ``forContentOwner``, ``forDeveloper``, or ``forMine`` filters.
    channelType : str
      The ``channelType`` parameter lets you restrict a search to a particular type of
      channel. Acceptable values are:

      * any - Return all channels.
      * show - Only retrieve shows.
    eventType : str
      The ``eventType`` parameter restricts a search to broadcast events. If you specify
      a value for this parameter, you must also set the type parameter's value to video.
      Acceptable values are:

      * completed - Only include completed broadcasts.
      * live - Only include active broadcasts.
      * upcoming - Only include upcoming broadcasts.
    forContentOwner : bool
      This parameter can only be used in a properly authorized request, and it is
      intended exclusively for YouTube content partners. The ``forContentOwner``
      parameter restricts the search to only retrieve videos owned by the content owner
      identified by the ``onBehalfOfContentOwner`` parameter. If ``forContentOwner`` is
      set to true, the request must also meet these requirements: The
      ``onBehalfOfContentOwner`` parameter is required. The user authorizing the request
      must be using an account linked to the specified content owner. The ``type``
      parameter value must be set to video. None of the following other parameters can
      be set: ``videoDefinition``, ``videoDimension``, ``videoDuration``,
      ``videoLicense``, ``videoEmbeddable``, ``videoSyndicated``, ``videoType``.
    forDeveloper : bool
      This parameter can only be used in a properly authorized request. The
      ``forDeveloper`` parameter restricts the search to only retrieve videos uploaded
      via the developer's application or website. The API server uses the request's
      authorization credentials to identify the developer. The ``forDeveloper``
      parameter can be used in conjunction with optional search parameters like the
      ``q`` parameter. For this feature, each uploaded video is automatically tagged
      with the project number that is associated with the developer's application in the
      Google Developers Console. When a search request subsequently sets the
      ``forDeveloper`` parameter to ``true`` the API server uses the request's
      authorization credentials to identify the developer. Therefore, a developer can
      restrict results to videos uploaded through the developer's own app or website but
      not to videos uploaded through other apps or sites.
    forMine : bool
      This parameter can only be used in a properly authorized request. The ``forMine``
      parameter restricts the search to only retrieve videos owned by the authenticated
      user. If you set this parameter to ``true``, then the ``type`` parameter's value
      must also be set to ``video``. In addition, none of the following other parameters
      can be set in the same request: ``videoDefinition``, ``videoDimension``,
      ``videoDuration``, ``videoLicense``, ``videoEmbeddable``, ``videoSyndicated``,
      ``videoType``.
    relatedToVideoId : str
      The ``relatedToVideoId`` parameter retrieves a list of videos that are related to
      the video that the parameter ``value`` identifies. The parameter ``value`` must be
      set to a YouTube video ID and, if you are using this parameter, the ``type``
      parameter must be set to video. Note that if the ``relatedToVideoId`` parameter is
      set, the only other supported parameters are ``part``, ``maxResults``,
      ``pageToken``, ``regionCode``, ``relevanceLanguage``, ``safeSearch``, ``type``
      (which must be set to video), and ``fields``.
    location : str
      The ``location`` parameter, in conjunction with the ``locationRadius`` parameter,
      defines a circular geographic area and also restricts a search to videos that
      specify, in their metadata, a geographic location that falls within that area. The
      parameter value is a string that specifies latitude/longitude coordinates e.g.
      (37.42307,-122.08427). The location parameter value identifies the point at the
      center of the area. The ``locationRadius`` parameter specifies the maximum
      distance that the location associated with a video can be from that point for the
      video to still be included in the search results. The API returns an error if your
      request specifies a value for the ``location`` parameter but does not also specify
      a value for the ``locationRadius`` parameter.
    locationRadius : str
      The ``locationRadius`` parameter, in conjunction with the ``location`` parameter,
      defines a circular geographic area. The parameter value must be a floating point
      number followed by a measurement unit. Valid measurement units are m, km, ft, and
      mi. For example, valid parameter values include 1500m, 5km, 10000ft, and 0.75mi.
      The API does not support ``locationRadius`` parameter values larger than 1000
      kilometers. Note: See the definition of the ``location`` parameter for more
      information.
    maxResults : int
      The ``maxResults`` parameter specifies the maximum number of items that should be
      returned in the result set. Acceptable values are 0 to 50, inclusive. The default
      value is 5.
    onBehalfOfContentOwner : str
      This parameter can only be used in a properly authorized request. Note: This
      parameter is intended exclusively for YouTube content partners. The
      ``onBehalfOfContentOwner`` parameter indicates that the request's authorization
      credentials identify a YouTube CMS user who is acting on behalf of the content
      owner specified in the parameter value. This parameter is intended for YouTube
      content partners that own and manage many different YouTube channels. It allows
      content owners to authenticate once and get access to all their video and channel
      data, without having to provide authentication credentials for each individual
      channel. The CMS account that the user authenticates with must be linked to the
      specified YouTube content owner.
    order : str
      The order parameter specifies the method that will be used to order resources in
      the API response. The default value is relevance. Acceptable values are:

      * date - Resources are sorted in reverse chronological order based on the
        date they were created.
      * rating - Resources are sorted from highest to lowest rating.
      * relevance - Resources are sorted based on their relevance to the search
        query. This is the default value for this parameter.
      * title - Resources are sorted alphabetically by title.
      * videoCount - Channels are sorted in descending order of their number of
        uploaded videos.
      * viewCount - Resources sorted from highest to lowest number of views.
        For live broadcasts, videos are sorted by number of concurrent viewers
        while the broadcasts are ongoing.
    pageToken : str
      The ``pageToken`` parameter identifies a specific page in the result set that
      should be returned. In an API response, the ``nextPageToken`` and
      ``prevPageToken`` properties identify other pages that could be retrieved.
    publishedAfter : datetime
      The ``publishedAfter`` parameter indicates that the API response should only
      contain resources created at or after the specified time. The value is an RFC 3339
      formatted date-time value (1970-01-01T00:00:00Z).
    publishedBefore : datetime
      The ``publishedBefore`` parameter indicates that the API response should only
      contain resources created before or at the specified time. The value is an RFC
      3339 formatted date-time value (1970-01-01T00:00:00Z).
    regionCode : str
      The ``regionCode`` parameter instructs the API to return search results for videos
      that can be viewed in the specified country. The parameter value is an ISO 3166-1
      alpha-2 country code.
    relevanceLanguage : str
      The ``relevanceLanguage`` parameter instructs the API to return search results
      that are most relevant to the specified language. The parameter value is typically
      an ISO 639-1 two-letter language code. However, you should use the values zh-Hans
      for simplified Chinese and zh-Hant for traditional Chinese. Please note that
      results in other languages will still be returned if they are highly relevant to
      the search query term.
    safeSearch : str
      The ``safeSearch`` parameter indicates whether the search results should include
      restricted content as well as standard content. Acceptable values are:

      * moderate - YouTube will filter some content from search results and,
        at the least, will filter content that is restricted in your locale.
        Based on their content, search results could be removed from search
        results or demoted in search results. This is the default parameter
        value.
      * none - YouTube will not filter the search result set.
      * strict - YouTube will try to exclude all restricted content from the
        search result set. Based on their content, search results could be
        removed from search results or demoted in search results.
    topicId : str
      The ``topicId`` parameter indicates that the API response should only contain
      resources associated with the specified topic. The value identifies a Freebase
      topic ID.
    type : str
      The ``type`` parameter restricts a search query to only retrieve a particular type
      of resource. The value is a comma-separated list of resource types. The default
      value is video,channel,playlist. Acceptable values are: channel, playlist, and
      video.
    videoCaption : str
      The ``videoCaption`` parameter indicates whether the API should filter video
      search results based on whether they have captions. If you specify a value for
      this parameter, you must also set the ``type`` parameter's value to video.
      Acceptable values are:

      * any - Do not filter results based on caption availability.
      * closedCaption - Only include videos that have captions.
      * none - Only include videos that do not have captions.
    videoCategoryId : str
      The ``videoCategoryId`` parameter filters video search results based on their
      category. If you specify a value for this parameter, you must also set the
      ``type`` parameter's value to video.
    videoDefinition : str
      The ``videoDefinition`` parameter lets you restrict a search to only include
      either high definition (HD) or standard definition (SD) videos. HD videos are
      available for playback in at least 720p, though higher resolutions, like 1080p,
      might also be available. If you specify a value for this parameter, you must also
      set the ``type`` parameter's value to video. Acceptable values are:

      * any - Return all videos, regardless of their resolution.
      * high - Only retrieve HD videos.
      * standard - Only retrieve videos in standard definition.
    videoDimension : str
      The ``videoDimension`` parameter lets you restrict a search to only retrieve 2D or
      3D videos. If you specify a value for this parameter, you must also set the
      ``type`` parameter's value to video. Acceptable values are:

      * 2d - Restrict search results to exclude 3D videos.
      * 3d - Restrict search results to only include 3D videos.
      * any - Include both 3D and non-3D videos in returned results. This is the
        default value.
    videoDuration : str
      The ``videoDuration`` parameter filters video search results based on their
      duration. If you specify a value for this parameter, you must also set the
      ``type`` parameter's value to video. Acceptable values are:

      * any - Do not filter video search results based on their duration.
        This is the default value.
      * long - Only include videos longer than 20 minutes.
      * medium - Only include videos that are between four and 20 minutes
        long (inclusive).
      * short - Only include videos that are less than four minutes long.
    videoEmbeddable : str
      The ``videoEmbeddable`` parameter lets you restrict a search to only videos
      that can be embedded into a webpage. If you specify a value for this parameter,
      you must also set the ``type`` parameter's value to video. Acceptable values are:

      * any - Return all videos, embeddable or not.
      * true - Only retrieve embeddable videos.
    videoLicense : str
      The ``videoLicense`` parameter filters search results to only include videos with
      a particular license. YouTube lets video uploaders choose to attach either the
      Creative Commons license or the standard YouTube license to each of their videos.
      If you specify a value for this parameter, you must also set the ``type``
      parameter's value to video. Acceptable values are:

      * any - Return all videos, regardless of which license they have,
        that match the query parameters.
      * creativeCommon - Only return videos that have a Creative Commons
        license. Users can reuse videos with this license in other videos that
        they create.
      * youtube - Only return videos that have the standard YouTube license.
    videoSyndicated : str
      The ``videoSyndicated`` parameter lets you restrict a search to only videos
      that can be played outside youtube.com. If you specify a value for this parameter,
      you must also set the ``type`` parameter's value to video. Acceptable values are:

      * any - Return all videos, syndicated or not.
      * true - Only retrieve syndicated videos.
    videoType : str
      The ``videoType`` parameter lets you restrict a search to a particular type of
      videos. If you specify a value for this parameter, you must also set the ``type``
      parameter's value to video. Acceptable values are:

      * any - Return all videos.
      * episode - Only retrieve episodes of shows.
      * movie - Only retrieve movies.

    Returns
    -------
    serp_df : pandas.DataFrame
      One row per search result. The columns start with the request parameters
      that were supplied (except ``key``), followed by ``queryTime``, ``rank``,
      ``title``, ``description``, ``publishedAt``, ``channelTitle``,
      ``totalResults`` and ``kind``, then all remaining columns in sorted order.
      When video or channel IDs are present, their details are merged in under
      the ``video.`` and ``channel.`` column prefixes.

    Raises
    ------
    ValueError
      If a video-only parameter is supplied without ``type`` being "video", or
      if a parameter value is not among the valid values known for it.
    Exception
      If the API answers a request with an HTTP status of 400 or above; the
      decoded JSON error body is the exception argument.
    """
    # ``locals()`` must stay the first statement so that it only holds the
    # function arguments.
    params = locals()
    supplied_params = {k: v for k, v in params.items() if v}
    # Wrap scalars so that every supplied parameter is a collection of values.
    supplied_params = {
        name: [value] if isinstance(value, (str, int)) else value
        for name, value in supplied_params.items()
    }
    type_vid_params = {
        "eventType",
        "relatedToVideoId",
        "videoCaption",
        "videoCategoryId",
        "videoDefinition",
        "videoDimension",
        "videoDuration",
        "videoEmbeddable",
        "videoLicense",
        "videoSyndicated",
        "videoType",
        "forMine",
        "forContentOwner",
    }
    # The check runs on the wrapped values so that ``type=["video"]`` is
    # accepted exactly like ``type="video"``.
    if type_vid_params.intersection(supplied_params) and set(
        supplied_params.get("type", ())
    ) != {"video"}:
        raise ValueError(
            'You need to set type="video" if you want to set'
            " any of the following:" + str(type_vid_params)
        )
    for p in supplied_params:
        if p in SERP_YTUBE_VALID_VALS:
            if not set(supplied_params[p]).issubset(SERP_YTUBE_VALID_VALS[p]):
                raise ValueError(
                    "Please make sure you provide a"
                    ' valid value for "{}", valid values:\n{}'.format(
                        p, sorted([str(x) for x in SERP_YTUBE_VALID_VALS[p]])
                    )
                )
    params_list = _dict_product(supplied_params)
    base_url = "https://www.googleapis.com/youtube/v3/search?part=snippet"
    payloads = []
    for param in params_list:
        # Keep the API key out of the log output.
        param_log = ", ".join(
            [k + "=" + str(v) for k, v in param.items() if k != "key"]
        )
        logging.info(msg="Requesting: " + param_log)
        resp = requests.get(base_url, params=param)
        if resp.status_code >= 400:
            raise Exception(resp.json())
        # Decode each body once instead of re-parsing it for every field.
        payloads.append(resp.json())

    frames = []
    param_cols = []
    for request_params, payload in zip(params_list, payloads):
        # The key is needed for the request only, never as a result column.
        query_params = {k: v for k, v in request_params.items() if k != "key"}
        param_cols = list(query_params)
        items = payload["items"]
        snippet_df = pd.DataFrame([x["snippet"] for x in items])
        id_df = pd.DataFrame([x["id"] for x in items])
        if "channelId" in id_df:
            # The snippet already carries ``channelId``; avoid a duplicate column.
            id_df = id_df.drop("channelId", axis=1)
        if "thumbnails" in snippet_df:
            thumb_df = json_normalize(snippet_df["thumbnails"])
        else:
            thumb_df = pd.DataFrame()
        page_info = payload["pageInfo"]
        temp_df = pd.concat([snippet_df, id_df, thumb_df], axis=1).assign(**page_info)
        temp_df["rank"] = range(1, len(temp_df) + 1)
        if len(temp_df) == 0:
            # No results: build a single placeholder row so that the request
            # still shows up in the output.
            empty_df_cols = [
                "title",
                "description",
                "publishedAt",
                "channelTitle",
                "kind",
                "videoId",
                "channelId",
            ]
            # Assigning a one-element list gives the empty frame its row;
            # ``q`` may be absent when another parameter drives the search.
            temp_df = temp_df.assign(q=[query_params.get("q")])
            temp_df = temp_df.assign(**dict.fromkeys(empty_df_cols))
            temp_df = temp_df.assign(**page_info)
        temp_df = temp_df.assign(**query_params)
        temp_df["nextPageToken"] = payload.get("nextPageToken")
        frames.append(temp_df)
    # A single concat avoids re-copying the accumulated frame on each iteration.
    result_df = pd.concat(frames, sort=False, ignore_index=True)
    result_df["queryTime"] = datetime.datetime.now(tz=datetime.timezone.utc)
    result_df["queryTime"] = pd.to_datetime(result_df["queryTime"])
    specified_cols = [
        "queryTime",
        "rank",
        "title",
        "description",
        "publishedAt",
        "channelTitle",
        "totalResults",
        "kind",
    ]
    ordered_cols = param_cols + specified_cols
    non_ordered = result_df.columns.difference(set(ordered_cols))
    final_df = result_df[ordered_cols + list(non_ordered)]

    # ``videoId`` is missing altogether when no response contained a video
    # (e.g. a channel-only search), so guard every use of the column.
    if "videoId" in final_df:
        vid_ids = ",".join(final_df["videoId"].dropna())
        if vid_ids:
            vid_details_df = youtube_video_details(vid_ids=vid_ids, key=key)
            vid_details_df.columns = ["video." + x for x in vid_details_df.columns]
            final_df = pd.merge(
                final_df,
                vid_details_df,
                how="left",
                left_on="videoId",
                right_on="video.id",
            )
    if "channelId" in final_df:
        channel_ids = ",".join(final_df["channelId"].dropna())
        if channel_ids:
            channel_details_df = youtube_channel_details(
                channel_ids=channel_ids, key=key
            )
            channel_details_df.columns = [
                "channel." + x for x in channel_details_df.columns
            ]
            final_df = pd.merge(
                final_df,
                channel_details_df,
                how="left",
                left_on="channelId",
                right_on="channel.id",
            )
    if "videoId" in final_df:
        # NOTE(review): this keeps only the first row per ``videoId``, which
        # also applies to a video returned by several requests - confirm that
        # this is intended.
        final_df = final_df.drop_duplicates(subset=["videoId"])
    return final_df.reset_index(drop=True)
[docs]
def set_logging_level(level_or_name):
    """Set the level of the root logger for the rest of the session.

    Parameters
    ----------
    level_or_name : int or str
      Either a numeric level (0, 10, 20, 30, 40, 50) or a level name
      ('NOTSET', 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL').

    Raises
    ------
    ValueError
      If ``level_or_name`` is not one of the values listed above.
    """
    accepted = [
        *range(0, 60, 10),
        "NOTSET",
        "DEBUG",
        "INFO",
        "WARNING",
        "ERROR",
        "CRITICAL",
    ]
    if level_or_name in accepted:
        logging.getLogger().setLevel(level_or_name)
        return
    raise ValueError(f"Please make sure you supply a value from: {accepted}")
# Runs at import time: set the root logger's threshold to INFO.
# NOTE(review): presumably so that the "Requesting: ..." messages emitted via
# ``logging.info`` by the serp functions are shown by default - confirm.
logging.getLogger().setLevel("INFO")