import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
import time
from google.cloud import storage
import io
import re
import requests
import gcsfs
def bq_date(x):
    # Zero-padded YYYYMMDD string, used as a date suffix in GCS object keys.
    return "{0}{1:02d}{2:02d}".format(x.year, x.month, x.day)
def list_gcs_objs(bucket, prefix):
storage_client = storage.Client()
bucket_check = storage_client.get_bucket(bucket)
blob_list = list(bucket_check.list_blobs(prefix=prefix))
    # The first blob under a prefix is the zero-byte "folder" placeholder,
    # so a listing of one blob or fewer is effectively empty.
    if len(blob_list) <= 1:
        print("Folder empty\n")
        return []
    return [blob.name for blob in blob_list[1:]]
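# e.g. list_gcs_objs("gdata-dn-gshow-sandbox", "AD/INS/") returns object keys
# like "AD/INS/instagram_stories_ad_20190806.csv" (filename illustrative,
# following the upload pattern used in st_scrapper below).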
def upload_to_gcs(bucket, object_key, data):
    # storage.Client's first positional argument is a project id, not a
    # bucket name, so the bucket must not be passed here.
    storage_client = storage.Client()
    bucket_up = storage_client.get_bucket(bucket)
    blob_up = bucket_up.blob(object_key)
    # upload_from_string returns None on success; return the object key so
    # callers can log what was written.
    blob_up.upload_from_string(data)
    return object_key
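# e.g. upload_to_gcs("gdata-dn-gshow-sandbox", "AD/INS/example.csv", csv_text)
# writes csv_text to gs://gdata-dn-gshow-sandbox/AD/INS/example.csv
# (object key illustrative).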
def getInstagramStoriesFeed(base_url):
fields = "?fields=id,caption,media_type,media_url,permalink,timestamp,username"
return base_url + fields
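# e.g. getInstagramStoriesFeed("https://graph.facebook.com/v3.2/<media-id>")
# -> "https://graph.facebook.com/v3.2/<media-id>?fields=id,caption,media_type,media_url,permalink,timestamp,username"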
def getStories_Insights(post_id, access_token):
base = "https://graph.facebook.com/v3.2/"
arequest = "{0}/insights?access_token={1}&metric=".format(post_id, access_token) + \
"impressions,reach,replies,exits,taps_forward,taps_back"
return base + arequest
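# e.g. getStories_Insights("<post-id>", "<token>") ->
# "https://graph.facebook.com/v3.2/<post-id>/insights?access_token=<token>"
# "&metric=impressions,reach,replies,exits,taps_forward,taps_back"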
def scrapeInstagramStories(page_id, access_token, since_date, until_date):
    # Accumulate pages locally rather than via module-level globals:
    # st_scrapper concatenates results across pages itself, and a shared
    # global would be clobbered on every call.
    ndf = pd.DataFrame()
    scrape_starttime = date.today()
base_url = "https://graph.facebook.com/v3.2"
node = "/{}/stories?fields=".format(page_id)
fields = "id,caption,media_type,permalink,timestamp,username"
parameters = "&limit=100&access_token={0}".format(access_token)
anchor = since_date
after = ''
print("Scraping {} Instagram Page: {}\n".format(page_id, scrape_starttime))
ctr = 1
count = 0
    while (anchor >= since_date) and (count < 10):
        after = '' if after == '' else "&after={}".format(after)
        url = base_url + node + fields + parameters + after
        content = requests.get(url).json()
        # Error responses carry no 'data' key, so default to an empty page.
        sf = pd.DataFrame.from_dict(content.get('data', []))
        if len(sf) > 0:
            sf['timestamp'] = pd.to_datetime(sf.timestamp, infer_datetime_format=True)
            sf['data'] = sf.timestamp.apply(lambda x: x.date())
            anchor = sf.data.min()
            # Keep this page before checking for a cursor, so the last page
            # is not dropped.
            if ctr == 1:
                ndf = sf.copy()
            else:
                ndf = pd.concat([sf, ndf], sort=False)
            ctr += 1
            # If there is no next page, we're done.
            if 'paging' in content:
                after = content['paging']['cursors']['after']
            else:
                break
        else:
            # An empty page means there is nothing further to fetch.
            break
        count += 1
if ndf.empty:
return ndf
    # Keep only stories posted inside the requested [since_date, until_date]
    # window; compare as datetime.date so isin matches the 'data' column.
    date_index = [d.date() for d in pd.date_range(start=since_date, end=until_date)]
    ndf['timestamp'] = pd.to_datetime(ndf.timestamp, infer_datetime_format=True)
    ndf['data'] = ndf.timestamp.apply(lambda x: x.date())
    ndf['data'] = np.where(ndf.data.isin(date_index), ndf.data, np.nan)
    ndf.dropna(subset=['data'], inplace=True)
    ndf['data'] = pd.to_datetime(ndf.data, infer_datetime_format=True)
    # Collect story insights per post. Each insights response looks like
    # {'data': [{'name': 'impressions', 'values': [{'value': N}]}, ...]},
    # so key by metric name rather than by position in the response, and
    # fall back to 0 when the API returns an error instead of 'data'.
    metrics = ['impressions', 'reach', 'replies', 'exits', 'taps_forward', 'taps_back']
    metric_maps = {m: {} for m in metrics}
    for post in ndf.id.unique():
        aux_url = getStories_Insights(post, access_token)
        insights = requests.get(aux_url).json()
        if 'data' in insights and insights['data']:
            for entry in insights['data']:
                if entry['name'] in metric_maps:
                    metric_maps[entry['name']][post] = entry['values'][0]['value']
        else:
            for m in metrics:
                metric_maps[m][post] = 0
    for m in metrics:
        ndf[m] = ndf.id.map(metric_maps[m]).fillna(0)
ndf['id'] = ndf.id.astype('category')
# ndf['caption'] = ndf.caption.astype('category')
# ndf['media_type'] = ndf.media_type.astype('category')
# ndf['permalink'] = ndf.permalink.astype('category')
# ndf['username'] = ndf.username.astype('category')
ndf['impressions'] = ndf.impressions.astype('int64')
ndf['reach'] = ndf.reach.astype('int64')
ndf['replies'] = ndf.replies.astype('int64')
ndf['exits'] = ndf.exits.astype('int64')
ndf['taps_forward'] = ndf.taps_forward.astype('int64')
ndf['taps_back'] = ndf.taps_back.astype('int64')
return ndf
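# The requests.get(...).json() calls above have no retry handling. A hedged
# sketch of a backoff wrapper that could stand in for them; the name
# get_json_with_retry and its defaults are illustrative, not part of the
# original pipeline.
def get_json_with_retry(url, retries=3, backoff=2):
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()  # surface HTTP 4xx/5xx as exceptions
            return resp.json()
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(backoff ** attempt)  # wait 1s, 2s, 4s, ...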
def st_scrapper(request):
    # Fresh frame per invocation; a module-level global could leak rows
    # between warm Cloud Function invocations.
    ndf = pd.DataFrame()
since_date = (date.today() - timedelta(1))
until_date = date.today()
today = date.today().strftime("%Y%m%d")
mybucket = "gdata-dn-gshow-sandbox"
mainprefix = "AD/INS/"
    # List available Instagram data already in GCS
maindata = list_gcs_objs(mybucket, mainprefix)
ins_dates = [x[-12:-4] for x in maindata]
# Queries
query_tags = "SELECT * FROM `globoid.AD_gshow_hashtags`"
dtags = pd.read_gbq(query_tags, dialect='standard', index_col="Hashtag")
tags = dtags['Produto'].to_dict()
user_token = ""
    # page_id -> page name; a set has no .keys() and is not subscriptable,
    # and the gshow_pages[page] lookups below require a dict. Contents redacted.
    gshow_pages = {'': '',
                   }
    gshow_pagelist = list(gshow_pages.keys())
    ctr = 1
    for page in gshow_pagelist:
        print("{0} | Date Range {1} - {2}".format(gshow_pages[page],
                                                  since_date.strftime("%Y-%m-%d"),
                                                  until_date.strftime("%Y-%m-%d")))
        df = scrapeInstagramStories(page_id=page, access_token=user_token,
                                    since_date=since_date, until_date=until_date)
        if df.empty:
            print("{0} is empty!".format(gshow_pages[page]))
            continue
        ndf = df.copy() if ctr == 1 else pd.concat([ndf, df], sort=False)
        ctr += 1
ndf['timestamp'] = pd.to_datetime(ndf.timestamp, infer_datetime_format=True)
ndf['timestamp'] = ndf.timestamp.dt.tz_localize('UTC')
ndf['timestamp'] = ndf.timestamp.dt.tz_convert('America/Sao_Paulo')
    ndf['caption'] = ndf.caption.fillna("None")
ndf['completion_rate'] = 1 - (ndf.exits + ndf.taps_forward - ndf.taps_back) / (ndf.impressions)
ndf.drop('data', axis=1, inplace=True)
    # Extract the first two hashtags from each caption (single findall pass).
    hashtag_lists = ndf.caption.apply(lambda x: re.findall(r"#(\w+)", x))
    ndf['hashtag'] = hashtag_lists.apply(lambda h: "#" + h[0] if len(h) > 0 else "None")
    ndf['hashtag2'] = hashtag_lists.apply(lambda h: "#" + h[1] if len(h) > 1 else "None")
pagen = {'bbb': 'Big Brother Brasil', 'gshow': 'GSHOW',
'caldeiraodohuck': 'Caldeirão do Huck', 'oficialzorra': 'Zorra',
'popstar': 'Popstar', 'thevoicebrasil': 'The Voice Brasil',
'conversacombial': 'Conversa com Bial', 'malhacao': 'Malhação - Toda Forma de Amar'}
ndf['produto'] = np.where(ndf.username == "gshow", ndf.hashtag.map(tags), np.nan)
ndf['produto'] = np.where((ndf.username == "gshow") & (ndf.produto.isnull()),
ndf.hashtag2.map(tags), ndf.produto)
ndf['produto'] = np.where((ndf.username != "gshow") & (ndf.produto.isnull()),
ndf.username.map(pagen), ndf.produto)
ndf['produto'] = np.where(ndf.produto.isnull(), "GSHOW", ndf.produto)
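    # e.g. a gshow story whose first hashtag maps in `tags` keeps that Produto;
    # a story from username "bbb" maps to "Big Brother Brasil" via `pagen`;
    # anything still unmatched falls back to "GSHOW".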
ndf.drop(['hashtag', 'hashtag2'], axis=1, inplace=True)
ndf['caption'] = ndf.caption.astype('category')
ndf['id'] = ndf.id.astype('category')
ndf['media_type'] = ndf.media_type.astype('category')
ndf['media_url'] = ndf.media_url.astype('category')
ndf['permalink'] = ndf.permalink.astype('category')
ndf['username'] = ndf.username.astype('category')
ndf['produto'] = ndf.produto.astype('category')
    # Select only columns this stories pipeline actually creates; the posts-feed
    # columns (video_views, like_count, comments_count, saved, interactions)
    # never exist here and would raise a KeyError.
    ndf = ndf[['timestamp', 'username', 'id', 'caption', 'permalink', 'media_type', 'media_url',
               'impressions', 'reach', 'replies', 'exits', 'taps_forward', 'taps_back',
               'completion_rate', 'produto']]
ndf['timestamp'] = ndf.timestamp.apply(bq_date)
ndf['timestamp'] = ndf.timestamp.astype("category")
df = ndf.copy()
for dtx in df[df.timestamp != today].timestamp.unique():
print(dtx)
s = io.StringIO()
aux = df[df.timestamp == dtx].copy()
aux.to_csv(s, sep=",", encoding="utf-8", index=False)
response = upload_to_gcs('gdata-dn-gshow-sandbox',
'AD/INS/instagram_stories_ad_{0}.csv'.format(dtx),
s.getvalue())
print("{0} - {1}".format(dtx, response), end="\r", flush=True)
ERROR FOUND

insertId: "000000-df33f5d7-a462-4357-bd60-15a52f1b66c5"
labels: {...}
logName: "projects/gdata-dn-gshow-sandbox/logs/cloudfunctions.googleapis.com%2Fcloud-functions"
receiveTimestamp: "2019-08-07T17:39:07.365346300Z"
resource: {...}
severity: "ERROR"
textPayload: "Traceback (most recent call last):
  File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 346, in run_http_function
    result = _function_handler.invoke_user_function(flask.request)
  File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 217, in invoke_user_function
    return call_user_function(request_or_event)
  File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 210, in call_user_function
    return self._user_function(request_or_event)
  File "/user_code/main.py", line 262, in st_scrapper
    ndf['media_url'] = ndf.media_url.astype('category')
  File "/env/local/lib/python3.7/site-packages/pandas/core/generic.py", line 5180, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'media_url'"
timestamp: "2019-08-07T17:39:01.147Z"
trace: "projects/gdata-dn-gshow-sandbox/traces/a1448d780d796c9f70f4903775329c77"