I'm getting the error AttributeError: 'DataFrame' object has no attribute 'media_url'. How can I fix it? Thanks.


import pandas as pd
import numpy as np
from datetime import datetime, date, timedelta
import time
from google.cloud import storage
import io
import re
import requests
import gcsfs


def bq_date(x):
    # Format a date/timestamp as YYYYMMDD (zero-padded month and day).
    return x.strftime("%Y%m%d")


def list_gcs_objs(bucket, prefix):
    # List object names under a prefix, skipping the first blob
    # (the folder placeholder itself).
    storage_client = storage.Client()
    bucket_check = storage_client.get_bucket(bucket)
    blob_list = list(bucket_check.list_blobs(prefix=prefix))
    if len(blob_list) <= 1:
        print("Folder empty\n")
        return []
    return [blob.name for blob in blob_list[1:]]


def upload_to_gcs(bucket, object_key, data):
    # storage.Client() takes a project id, not a bucket name,
    # so the bucket must not be passed to the constructor.
    storage_client = storage.Client()
    bucket_up = storage_client.get_bucket(bucket)
    blob_up = bucket_up.blob(object_key)
    # upload_from_string returns None on success.
    return blob_up.upload_from_string(data)


def getInstagramStoriesFeed(base_url):
    fields = "?fields=id,caption,media_type,media_url,permalink,timestamp,username"
    return base_url + fields


def getStories_Insights(post_id, access_token):
    base = "https://graph.facebook.com/v3.2/"
    arequest = "{0}/insights?access_token={1}&metric=".format(post_id, access_token) + \
               "impressions,reach,replies,exits,taps_forward,taps_back"
    return base + arequest


def scrapeInstagramStories(page_id, access_token, since_date, until_date):
    global ndf, sf
    scrape_starttime = date.today()  # date is already imported at module level
    base_url = "https://graph.facebook.com/v3.2"
    node = "/{}/stories?fields=".format(page_id)
    fields = "id,caption,media_type,permalink,timestamp,username"
    parameters = "&limit=100&access_token={0}".format(access_token)
    anchor = since_date
    after = ''

    print("Scraping {} Instagram Page: {}\n".format(page_id, scrape_starttime))
    ctr = 1
    count = 0
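    # Page through the stories feed (at most 10 pages), following the
    # "after" cursor until posts older than since_date show up.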
    while (anchor >= since_date) and (count < 10):
        after = '' if after == '' else "&after={}".format(after)
        url = base_url + node + fields + parameters + after

        content = requests.get(url).json()
        sf = pd.DataFrame.from_dict(content['data'])

        if len(sf) > 0:
            sf['timestamp'] = pd.to_datetime(sf.timestamp, infer_datetime_format=True)
            sf['data'] = sf.timestamp.apply(lambda x: x.date())
            anchor = sf.data.min()

        # If the response has a next-page cursor, follow it and keep
        # this page's rows; otherwise the cursor is left unchanged.
        if 'paging' in content:
            after = content['paging']['cursors']['after']
            if ctr == 1:
                ndf = sf.copy()
            else:
                ndf = pd.concat([sf, ndf], sort=False)
            ctr += 1

        count += 1

    if ndf.empty:
        return ndf

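    # Note: the np.where call below replaces dates that fall inside
    # [since_date, until_date] with NaN and then drops them, i.e. it
    # keeps only rows *outside* that range.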
    date_index = list(pd.date_range(start=since_date, end=until_date))
    ndf['timestamp'] = pd.to_datetime(ndf.timestamp, infer_datetime_format=True)
    ndf['data'] = ndf.timestamp.apply(lambda x: x.date())

    ndf['data'] = np.where(ndf.data.isin(date_index),
                           np.nan,
                           ndf.data)

    ndf.dropna(subset=['data'], inplace=True)
    ndf['data'] = pd.to_datetime(ndf.data, infer_datetime_format=True)

    impressions = {}
    reach = {}
    replies = {}
    exits = {}
    tf = {}
    tb = {}

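    # The indexing below assumes an insights payload shaped like
    # {"data": [{"name": "impressions", "values": [{"value": N}]}, ...]},
    # with the metrics returned in the order they were requested.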
    for post in ndf.id.unique():
        aux_url = getStories_Insights(post, access_token)
        insights = requests.get(aux_url).json()
        if len(insights) > 0:
            impressions.update({post: insights['data'][0]['values'][0]['value']})
            reach.update({post: insights['data'][1]['values'][0]['value']})
            replies.update({post: insights['data'][2]['values'][0]['value']})
            exits.update({post: insights['data'][3]['values'][0]['value']})
            tf.update({post: insights['data'][4]['values'][0]['value']})
            tb.update({post: insights['data'][5]['values'][0]['value']})
        else:
            impressions.update({post: 0})
            reach.update({post: 0})
            replies.update({post: 0})
            exits.update({post: 0})
            tf.update({post: 0})
            tb.update({post: 0})

    ndf['impressions'] = ndf.id.map(impressions)
    ndf['reach'] = ndf.id.map(reach)
    ndf['replies'] = ndf.id.map(replies)
    ndf['exits'] = ndf.id.map(exits)
    ndf['taps_forward'] = ndf.id.map(tf)
    ndf['taps_back'] = ndf.id.map(tb)

    ndf['id'] = ndf.id.astype('category')
    # ndf['caption'] = ndf.caption.astype('category')
    # ndf['media_type'] = ndf.media_type.astype('category')
    # ndf['permalink'] = ndf.permalink.astype('category')
    # ndf['username'] = ndf.username.astype('category')

    ndf['impressions'] = ndf.impressions.astype('int64')
    ndf['reach'] = ndf.reach.astype('int64')
    ndf['replies'] = ndf.replies.astype('int64')
    ndf['exits'] = ndf.exits.astype('int64')
    ndf['taps_forward'] = ndf.taps_forward.astype('int64')
    ndf['taps_back'] = ndf.taps_back.astype('int64')

    return ndf


def st_scrapper(request):
    global ndf
    since_date = (date.today() - timedelta(1))
    until_date = date.today()
    today = date.today().strftime("%Y%m%d")

    mybucket = "gdata-dn-gshow-sandbox"
    mainprefix = "AD/INS/"

    # List FB Avaiable Data

    maindata = list_gcs_objs(mybucket, mainprefix)
    ins_dates = [x[-12:-4] for x in maindata]

    # Queries
    query_tags = "SELECT * FROM `globoid.AD_gshow_hashtags`"

    dtags = pd.read_gbq(query_tags, dialect='standard', index_col="Hashtag")
    tags = dtags['Produto'].to_dict()

    user_token = ""

    # page_id -> page name (values redacted in the original post).
    # This must be a dict: the loop below calls .keys() and indexes it.
    gshow_pages = {'': '',
                   }

    gshow_pagelist = list(gshow_pages.keys())

    ctr = 1
    for page in gshow_pagelist:
        print("{0} | Date Range {1} - {2}".format(gshow_pages[page],
                                                  since_date.strftime("%Y-%m-%d"),
                                                  until_date.strftime("%Y-%m-%d"))
              )
        if ctr == 1:
            df = scrapeInstagramStories(page_id=page, access_token=user_token,
                                        since_date=since_date, until_date=until_date)

            if df.empty:
                print("{0} is empty!".format(gshow_pages[page]))
            else:
                ndf = df.copy()

            ctr += 1
        else:
            df = scrapeInstagramStories(page_id=page, access_token=user_token,
                                        since_date=since_date, until_date=until_date)

            if df.empty:
                print("{0} is empty!".format(gshow_pages[page]))
            else:
                ndf = pd.concat([ndf, df], sort=False)

            ctr += 1

    ndf['timestamp'] = pd.to_datetime(ndf.timestamp, infer_datetime_format=True)
    ndf['timestamp'] = ndf.timestamp.dt.tz_localize('UTC')
    ndf['timestamp'] = ndf.timestamp.dt.tz_convert('America/Sao_Paulo')

    ndf.caption.fillna("None", inplace=True)
    ndf['completion_rate'] = 1 - (ndf.exits + ndf.taps_forward - ndf.taps_back) / (ndf.impressions)
    ndf.drop('data', axis=1, inplace=True)

    ndf['hashtag'] = ndf.caption.apply(
        lambda x: "#" + re.findall(r"#(\w+)", x)[0]
        if len(re.findall(r"#(\w+)", x)) > 0 else "None")

    ndf['hashtag2'] = ndf.caption.apply(
        lambda x: "#" + re.findall(r"#(\w+)", x)[1]
        if len(re.findall(r"#(\w+)", x)) > 1 else "None")

    pagen = {'bbb': 'Big Brother Brasil', 'gshow': 'GSHOW',
             'caldeiraodohuck': 'Caldeirão do Huck', 'oficialzorra': 'Zorra',
             'popstar': 'Popstar', 'thevoicebrasil': 'The Voice Brasil',
             'conversacombial': 'Conversa com Bial', 'malhacao': 'Malhação - Toda Forma de Amar'}

    ndf['produto'] = np.where(ndf.username == "gshow", ndf.hashtag.map(tags), np.nan)
    ndf['produto'] = np.where((ndf.username == "gshow") & (ndf.produto.isnull()),
                              ndf.hashtag2.map(tags), ndf.produto)

    ndf['produto'] = np.where((ndf.username != "gshow") & (ndf.produto.isnull()),
                              ndf.username.map(pagen), ndf.produto)

    ndf['produto'] = np.where(ndf.produto.isnull(), "GSHOW", ndf.produto)
    ndf.drop(['hashtag', 'hashtag2'], axis=1, inplace=True)

    ndf['caption'] = ndf.caption.astype('category')
    ndf['id'] = ndf.id.astype('category')
    ndf['media_type'] = ndf.media_type.astype('category')
    ndf['media_url'] = ndf.media_url.astype('category')
    ndf['permalink'] = ndf.permalink.astype('category')
    ndf['username'] = ndf.username.astype('category')
    ndf['produto'] = ndf.produto.astype('category')

    # Keep only the columns actually created above.
    ndf = ndf[['timestamp', 'username', 'id', 'caption', 'permalink', 'media_type', 'impressions',
               'reach', 'replies', 'exits', 'taps_forward', 'taps_back', 'completion_rate', 'produto']]

    ndf['timestamp'] = ndf.timestamp.apply(bq_date)
    ndf['timestamp'] = ndf.timestamp.astype("category")

    df = ndf.copy()

    for dtx in df[df.timestamp != today].timestamp.unique():
        print(dtx)

        s = io.StringIO()
        aux = df[df.timestamp == dtx].copy()
        aux.to_csv(s, sep=",", encoding="utf-8", index=False)

        response = upload_to_gcs('gdata-dn-gshow-sandbox',
                                 'AD/INS/instagram_stories_ad_{0}.csv'.format(dtx),
                                 s.getvalue())

        print("{0} - {1}".format(dtx, response), end="\r", flush=True)

Error found in the Cloud Functions log:

insertId: "000000-df33f5d7-a462-4357-bd60-15a52f1b66c5"
labels: {...}
logName: "projects/gdata-dn-gshow-sandbox/logs/cloudfunctions.googleapis.com%2Fcloud-functions"
receiveTimestamp: "2019-08-07T17:39:07.365346300Z"
resource: {...}
severity: "ERROR"
textPayload: "Traceback (most recent call last):
  File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 346, in run_http_function
    result = _function_handler.invoke_user_function(flask.request)
  File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 217, in invoke_user_function
    return call_user_function(request_or_event)
  File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 210, in call_user_function
    return self._user_function(request_or_event)
  File "/user_code/main.py", line 262, in st_scrapper
    ndf['media_url'] = ndf.media_url.astype('category')
  File "/env/local/lib/python3.7/site-packages/pandas/core/generic.py", line 5180, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'media_url'"
timestamp: "2019-08-07T17:39:01.147Z"
trace: "projects/gdata-dn-gshow-sandbox/traces/a1448d780d796c9f70f4903775329c77"

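And a defensive variant for the casting section of st_scrapper, a sketch under the assumption that a missing media_url should become an empty column rather than an error (np and ndf are the names already used in the question):

import numpy as np

# Cast only columns that exist; create missing ones as NaN so neither
# the astype call nor a later column selection fails.
for col in ['caption', 'id', 'media_type', 'media_url', 'permalink', 'username']:
    if col not in ndf.columns:
        ndf[col] = np.nan
    ndf[col] = ndf[col].astype('category')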