In bs4 get text between elements
In bs4 get text between elements
HTML Source:
<script type="text/javascript">window._sharedData = {"activity_counts":null,"config":{"csrf_token":"P8DvqEB5AxkRuWyoNWhrZ3Bi2scbrVm9","viewer":null},"supports_es6":true,"country_code":"NL","language_code":"en","locale":"en_US","entry_data":{"ProfilePage":[{"logging_page_id":"profilePage_4469324900","show_suggested_profiles":false,"graphql":{"user":{"biography":"","blocked_by_viewer":false,"country_block":false,"external_url":null,"external_url_linkshimmed":null,"edge_followed_by":{"count":143},"followed_by_viewer":false,"edge_follow":{"count":43},"follows_viewer":false,"full_name":"\u0627\u062c\u0627\u0631\u0647 \u0648\u06cc\u0644\u0627 \u062f\u0631 \u06af\u0631\u062f\u0646\u0647 .................. ;</script>
<script type="text/javascript">
(function() $)');
var className = docElement.className;
docElement.className = className.replace(classRE, '$1js$2');
)();
</script>
Now I want the output to show just everything after window._sharedData =
window._sharedData =
Output:
{"activity_counts":null,"config":{"csrf_token":"P8DvqEB5AxkRuWyoNWhrZ3Bi2scbrVm9","viewer":null},"supports_es6":true,"count .......
Here is my code :
# Profile page whose inline <script> data is wanted.
url = 'https://www.instagram.com/mehran_eblaghi/'
# A session object is created; here it is used for a single GET only.
s = requests.session()
# findAll('script') collects every <script> tag in the page, not just the one
# holding window._sharedData. NOTE(review): `bs` is presumably an alias for
# bs4.BeautifulSoup -- the import is not shown in the question.
soup = bs(s.get(url).text, 'html.parser').findAll('script')
# Prints the whole list of script tags.
print(soup)
Have you already parsed the source by using beautifulsoup, searched for the first script tag and then taken its text as a starting point?
– Jon Clements♦
Sep 8 '18 at 9:21
@JonClements Yes, but it parsed all of the script tags.
– Andy Peterson
Sep 8 '18 at 9:26
use get_text()
– KaiserKatze
Sep 8 '18 at 9:26
That's what
findAll
does... if it's the first script you're after then you can do something like soup = bs(s.get(url).text, 'html.parser')
then work with that object, data = soup.select_one('script').text.partition(' = ')[2]
might work (untested) ?– Jon Clements♦
Sep 8 '18 at 9:27
findAll
soup = bs(s.get(url).text, 'html.parser')
data = soup.select_one('script').text.partition(' = ')[2]
1 Answer
1
Use bs4 to find the first script tag whose text starts with what you're looking for and then take the text content of that and split the start of it, eg:
import json
import requests
import bs4
# Extract the JSON assigned to window._sharedData in the page's inline script.
# `url` must already be defined (the address of the page to fetch).

# Prefix that introduces the JSON payload inside the <script> text.
key = 'window._sharedData = '
soup = bs4.BeautifulSoup(requests.get(url).text, 'html.parser')
# First <script> whose text begins with the prefix; the `L and` guard skips
# tags for which the text passed to the filter is None or empty.
script_tag = soup.find('script', text=lambda L: L and L.startswith(key))
if script_tag:
    # raw string of data in script
    text_data = script_tag.text.partition(key)[2]
    # Drop surrounding whitespace and the terminating ';' so that only the
    # JSON document remains; leaving the ';' in place makes json.loads raise
    # JSONDecodeError("Extra data").
    data = json.loads(text_data.strip().rstrip(';'))
else:
    # didn't find a match - up to you what to do here...
    # `data` is bound to None so later code can test for the missing case.
    data = None
If you managed to find the relevant script tag, then data
will be a Python dictionary of:
data
'activity_counts': None,
'config': 'csrf_token': '1Srrhc6GQmmC19TdM3nLFsDOORtJMpCj', 'viewer': None,
'supports_es6': False,
'country_code': 'GB',
'language_code': 'en',
'locale': 'en_US',
'entry_data': 'ProfilePage': ['logging_page_id': 'profilePage_4469324900',
'show_suggested_profiles': False,
'graphql': 'user': 'biography': '',
'blocked_by_viewer': False,
'country_block': False,
'external_url': None,
'external_url_linkshimmed': None,
'edge_followed_by': 'count': 143,
'followed_by_viewer': False,
'edge_follow': 'count': 43,
'follows_viewer': False,
'full_name': 'اجاره ویلا در گردنه حیران',
'has_channel': False,
'has_blocked_viewer': False,
'highlight_reel_count': 0,
'has_requested_viewer': False,
'id': '4469324900',
'is_business_account': False,
'is_private': False,
'is_verified': False,
'edge_mutual_followed_by': 'count': 0, 'edges': ,
'profile_pic_url': 'https://scontent-lht6-1.cdninstagram.com/vp/ee763d48bb0c35ac0c6aa22dc1e2ed08/5C31C768/t51.2885-19/s150x150/15876073_1641186492851073_2628164662507601920_n.jpg',
'profile_pic_url_hd': 'https://scontent-lht6-1.cdninstagram.com/vp/fd5c97116848cf46ddf24f8ac8d1fd7e/5C35B210/t51.2885-19/s320x320/15876073_1641186492851073_2628164662507601920_n.jpg',
'requested_by_viewer': False,
'username': 'mehran_eblaghi',
'connected_fb_page': None,
'edge_owner_to_timeline_media': 'count': 2,
'page_info': 'has_next_page': False,
'end_cursor': 'AQBnocogeHdSL1DSSxRdiYR4D1RguUeEj5Ap1do1KIy4U_NutZIe9ZCyRpDExD4TL9k',
'edges': ['node': '__typename': 'GraphImage',
'id': '1429655015362664538',
'edge_media_to_caption': 'edges': ['node': 'text': 'درصورت نیاز به ویلاتماس بگیرید 09112815125'],
'shortcode': 'BPXJ6luDBha',
'edge_media_to_comment': 'count': 10,
'comments_disabled': False,
'taken_at_timestamp': 1484648180,
'dimensions': 'height': 1080, 'width': 1080,
'display_url': 'https://scontent-lht6-1.cdninstagram.com/vp/abeb67556e5e2166e497cc779e99fab2/5C33A30D/t51.2885-15/e35/14597426_594812037376264_3725484886300033024_n.jpg',
'edge_liked_by': 'count': 42,
'edge_media_preview_like': 'count': 42,
'gating_info': None,
'media_preview': 'ACoqZEv32OQcHHpViMrKoJxkE59cVnFC7HB6epP+NSBGhXdkc8e/5+lNRYc0UaYiBO7J9aR3DDg4IOCaoR3IOFwcr1x0wf8AGkF0vJCHB75POP5UrNF3XyNBpARwQe1VN+OKiEqA5AIbr+Hr0q7tVud/Xn7p/wAahptj0RnKcEkVKXwMY/lTEbHNSbt3OK6rbadF1OaMrXXNZ3elr/oN345A5/xpxk56cGoy5JpBKR9KVutvxKUntzO3+H0/p9SSRz0x/wDqqrg1P5mRj161HVxVuljKcru6lf8ACw8DNH61KANo+lMIFGvRrbt/wSLq7unu+v8AwBhIHamnHYfqKdimEClZ919z/wAwutrfl/kKKKYAKkqlfr+Vv1Ynbof/2Q==',
'owner': 'id': '4469324900',
'thumbnail_src': 'https://scontent-lht6-1.cdninstagram.com/vp/a50ea8ec7e91454bc0b981b9a347c2b9/5C2CDBE8/t51.2885-15/sh0.08/e35/s640x640/14597426_594812037376264_3725484886300033024_n.jpg',
'thumbnail_resources': ['src': 'https://scontent-lht6-1.cdninstagram.com/vp/8ecae5da8cdf4f981a29ec7a0c6b0a08/5C30AF4F/t51.2885-15/e35/s150x150/14597426_594812037376264_3725484886300033024_n.jpg',
'config_width': 150,
'config_height': 150,
'src': 'https://scontent-lht6-1.cdninstagram.com/vp/fe3689ac4d9165c32369e8fc460f0040/5C187505/t51.2885-15/e35/s240x240/14597426_594812037376264_3725484886300033024_n.jpg',
'config_width': 240,
'config_height': 240,
'src': 'https://scontent-lht6-1.cdninstagram.com/vp/be7a47d6b422add7f77d597c0eecd21e/5C31FBBF/t51.2885-15/e35/s320x320/14597426_594812037376264_3725484886300033024_n.jpg',
'config_width': 320,
'config_height': 320,
'src': 'https://scontent-lht6-1.cdninstagram.com/vp/2f6d7c80500d9d56f940be6ffa0e8e9a/5C1568E5/t51.2885-15/e35/s480x480/14597426_594812037376264_3725484886300033024_n.jpg',
'config_width': 480,
'config_height': 480,
'src': 'https://scontent-lht6-1.cdninstagram.com/vp/a50ea8ec7e91454bc0b981b9a347c2b9/5C2CDBE8/t51.2885-15/sh0.08/e35/s640x640/14597426_594812037376264_3725484886300033024_n.jpg',
'config_width': 640,
'config_height': 640],
'is_video': False,
'accessibility_caption': None,
'node': '__typename': 'GraphImage',
'id': '1429628539162724247',
'edge_media_to_caption': 'edges': ,
'shortcode': 'BPXD5T1jgeX',
'edge_media_to_comment': 'count': 3,
'comments_disabled': False,
'taken_at_timestamp': 1484645024,
'dimensions': 'height': 1080, 'width': 1080,
'display_url': 'https://scontent-lht6-1.cdninstagram.com/vp/b48766cc9da8d14904f702a927884f5b/5C2B24EA/t51.2885-15/e35/16110374_198276563977954_7548368730246348800_n.jpg',
'edge_liked_by': 'count': 42,
'edge_media_preview_like': 'count': 42,
'gating_info': None,
'media_preview': 'ACoqdDpYeEP0J5wTSHR2C5yPzP8AhVn7YVi45xgYNWbWbzjtII4z1BFVzMjQpxacigHILZ6c/wD1qr6jaFDuwMH0rdBjDYGMjg5pJYVkXDcjNF9bkOCvzLc4dhim5rR1K3EEmByD0rNq7miNRmLIQOScVYsJGik+YEbhgfnn+VY4ncd6kSaR+nJH6etZhaxqyXa/PzyW4/Opri+/dqqn+77dqwZFZTyOvI96aZGHB4x7UrBa5PevvlJ69P5VVp5DPz1pm0+hqwL6wRZ5GB7k/wD6qsRwRxncuc/Wq6E0McEY4osTctCJGYM7EsvTt/jUhhiJ3MN59W5/+tUAozSsFywWUYCgY/pTOKhYZGaQGixLP//Z',
'owner': 'id': '4469324900',
'thumbnail_src': 'https://scontent-lht6-1.cdninstagram.com/vp/d37f58bf9a6bcbe17242a7e0b233c5c0/5C331E0F/t51.2885-15/sh0.08/e35/s640x640/16110374_198276563977954_7548368730246348800_n.jpg',
'thumbnail_resources': ['src': 'https://scontent-lht6-1.cdninstagram.com/vp/f14bd4b53c62c2fe56ba88f1a3ab85cf/5C1DC3A8/t51.2885-15/e35/s150x150/16110374_198276563977954_7548368730246348800_n.jpg',
'config_width': 150,
'config_height': 150,
'src': 'https://scontent-lht6-1.cdninstagram.com/vp/616bc4d9abe790d1c9e06dbb22e7b43f/5C266AE2/t51.2885-15/e35/s240x240/16110374_198276563977954_7548368730246348800_n.jpg',
'config_width': 240,
'config_height': 240,
'src': 'https://scontent-lht6-1.cdninstagram.com/vp/09d6473c69ad0b4e493f05c6d3aad9a4/5C205958/t51.2885-15/e35/s320x320/16110374_198276563977954_7548368730246348800_n.jpg',
'config_width': 320,
'config_height': 320,
'src': 'https://scontent-lht6-1.cdninstagram.com/vp/e5d6902499831040caded69325585dfc/5C350A02/t51.2885-15/e35/s480x480/16110374_198276563977954_7548368730246348800_n.jpg',
'config_width': 480,
'config_height': 480,
'src': 'https://scontent-lht6-1.cdninstagram.com/vp/d37f58bf9a6bcbe17242a7e0b233c5c0/5C331E0F/t51.2885-15/sh0.08/e35/s640x640/16110374_198276563977954_7548368730246348800_n.jpg',
'config_width': 640,
'config_height': 640],
'is_video': False,
'accessibility_caption': None],
'edge_saved_media': 'count': 0,
'page_info': 'has_next_page': False, 'end_cursor': None,
'edges': ,
'edge_media_collections': 'count': 0,
'page_info': 'has_next_page': False, 'end_cursor': None,
'edges': ,
'felix_onboarding_video_resources': 'mp4': '/static/videos/felix-onboarding/onboardingVideo.mp4/9d16838ca7f9.mp4',
'poster': '/static/images/felix-onboarding/onboardingVideoPoster.png/8fdba7cf2120.png'],
'gatekeepers': 'cb': True,
'sf': True,
'ld': True,
'seo': True,
'seoht': True,
'saa': True,
'phone_qp': True,
'knobs': 'acct:ntb': 0, 'cb': 0, 'captcha': 0,
'qe': 'form_navigation_dialog': 'g': '', 'p': ,
'cred_man': 'g': 'test', 'p': 'use_on_landing': 'true',
'iab': 'g': '', 'p': ,
'app_upsell_li': 'g': '', 'p': ,
'app_upsell': 'g': '', 'p': ,
'stale_fix': 'g': '', 'p': ,
'profile_header_name': 'g': '', 'p': ,
'bc3l': 'g': '', 'p': ,
'direct_conversation_reporting': 'g': '', 'p': ,
'general_reporting': 'g': '', 'p': ,
'reporting': 'g': '', 'p': ,
'acc_recovery_link': 'g': '', 'p': ,
'notif': 'g': '', 'p': ,
'fb_unlink': 'g': '', 'p': ,
'mobile_stories_doodling': 'g': '', 'p': ,
'show_copy_link': 'g': '', 'p': ,
'mobile_logout': 'g': '', 'p': ,
'p_edit': 'g': '', 'p': ,
'404_as_react': 'g': '', 'p': ,
'acc_recovery': 'g': '', 'p': ,
'collections': 'g': '', 'p': ,
'comment_ta': 'g': '', 'p': ,
'su': 'g': '', 'p': ,
'disc_ppl': 'g': '', 'p': ,
'ebd_ul': 'g': 'launch', 'p': 'is_enabled': 'true',
'ebdsim_li': 'g': '', 'p': ,
'ebdsim_lo': 'g': '', 'p': ,
'empty_feed': 'g': '', 'p': ,
'bundles': 'g': '', 'p': ,
'exit_story_creation': 'g': '', 'p': ,
'appsell': 'g': '', 'p': ,
'imgopt': 'g': '', 'p': ,
'follow_button': 'g': '', 'p': ,
'loggedout': 'g': '', 'p': ,
'loggedout_upsell': 'g': 'control_without_new_loggedout_upsell_content_03_15_18',
'p': 'has_new_loggedout_upsell_content': 'false',
'msisdn': 'g': '', 'p': ,
'bg_sync': 'g': '', 'p': ,
'onetaplogin': 'g': '', 'p': ,
'login_poe': 'g': '', 'p': ,
'private_lo': 'g': '', 'p': ,
'profile_tabs': 'g': '', 'p': ,
'push_notifications': 'g': '', 'p': ,
'reg': 'g': '', 'p': ,
'reg_vp': 'g': 'test_group_1', 'p': 'hide_value_prop': 'true',
'report_media': 'g': '', 'p': ,
'report_profile': 'g': '', 'p': ,
'scroll_log': 'g': '', 'p': ,
'sidecar_swipe': 'g': '', 'p': ,
'su_universe': 'g': '', 'p': ,
'stale': 'g': '', 'p': ,
'stories_lo': 'g': 'test_05_01', 'p': 'location': 'true',
'stories': 'g': '', 'p': ,
'tp_pblshr': 'g': '', 'p': ,
'video': 'g': '', 'p': ,
'gdpr_eu_tos': 'g': 'control_05_01',
'p': 'gdpr_required': 'true',
'eu_new_user_flow': 'age_two_button',
'tos_version': 'eu',
'gdpr_row_tos': 'g': '', 'p': ,
'fd_gr': 'g': '', 'p': ,
'felix': 'g': '', 'p': ,
'felix_clear_fb_cookie': 'g': '', 'p': ,
'felix_creation_duration_limits': 'g': '', 'p': ,
'felix_creation_enabled': 'g': '', 'p': ,
'felix_creation_fb_crossposting': 'g': '', 'p': ,
'felix_creation_fb_crossposting_v2': 'g': '', 'p': ,
'felix_creation_validation': 'g': '', 'p': ,
'felix_creation_video_upload': 'g': '', 'p': ,
'felix_early_onboarding': 'g': '', 'p': ,
'unfollow_confirm': 'g': '', 'p': ,
'profile_enhance_li': 'g': '', 'p': ,
'profile_enhance_lo': 'g': '', 'p': ,
'phone_confirm': 'g': '', 'p': ,
'comment_enhance': 'g': '', 'p': ,
'mweb_topical_explore': 'g': '', 'p': ,
'web_nametag': 'g': '', 'p': ,
'image_downgrade': 'g': '', 'p': ,
'image_downgrade_lite': 'g': '', 'p': ,
'follow_all_fb': 'g': '', 'p': ,
'lite_direct_upsell': 'g': '', 'p': ,
'web_loggedout_noop': 'g': '', 'p': ,
'stories_video_preload': 'g': '', 'p': ,
'lite_stories_video_preload': 'g': '', 'p': ,
'a2hs_heuristic_uc': 'g': '', 'p': ,
'a2hs_heuristic_non_uc': 'g': '', 'p': ,
'web_hashtag': 'g': '', 'p': ,
'header_scroll': 'g': '', 'p': ,
'rout': 'g': '', 'p': ,
'websr': 'g': '', 'p': ,
'web_lo_follow': 'g': '', 'p': ,
'web_share': 'g': '', 'p': ,
'lite_rating': 'g': '', 'p': ,
'web_embeds_share': 'g': '', 'p': ,
'web_share_lo': 'g': '', 'p': ,
'web_embeds_logged_out': 'g': 'test_comment_input',
'p': 'show_comment_input': 'true',
'sl': 'g': '', 'p': ,
'reg_nux': 'g': '', 'p': ,
'web_datasaver_mode': 'g': '', 'p': ,
'lite_datasaver_mode': 'g': '', 'p': ,
'lite_video_upload': 'g': '', 'p': ,
'hostname': 'www.instagram.com',
'platform': 'web',
'rhx_gis': 'b9d7a25d3e0772990918069a0652bc21',
'nonce': 'E+077618aJD12ZjcMWUynA==',
'zero_data': ,
'rollout_hash': '2502ae2429f4',
'bundle_variant': 'base',
'probably_has_app': False
When I try to convert it to JSON, I get this error: ` File "search.py", line 39, in <module> get_url() File "search.py", line 16, in get_url data = json.loads(text_data) File "/usr/lib/python3.6/json/__init__.py", line 354, in loads return _default_decoder.decode(s) File "/usr/lib/python3.6/json/decoder.py", line 342, in decode raise JSONDecodeError("Extra data", s, end) json.decoder.JSONDecodeError: Extra data: line 1 column 10631 (char 10630)`
– Andy Peterson
Sep 8 '18 at 9:55
@Andy did you include the
.rstrip(';')
?– Jon Clements♦
Sep 8 '18 at 9:56
.rstrip(';')
I added it here, ` data = json.loads(text_data).rstrip(';') `, but it does not work.
– Andy Peterson
Sep 8 '18 at 10:00
@Andy the line should be as per in this answer which is
data = json.loads(text_data.rstrip(';'))
:)– Jon Clements♦
Sep 8 '18 at 10:01
data = json.loads(text_data.rstrip(';'))
@Andy okay... copy and paste this answer, add the
url = ...
to the address you're using and then run it from scratch - you should be fine...– Jon Clements♦
Sep 8 '18 at 10:06
url = ...
Thanks for contributing an answer to Stack Overflow!
But avoid …
To learn more, see our tips on writing great answers.
Required, but never shown
Required, but never shown
By clicking "Post Your Answer", you acknowledge that you have read our updated terms of service, privacy policy and cookie policy, and that your continued use of the website is subject to these policies.
Hi, please show your code, and explain which part of it does not work for you.
– zvone
Sep 8 '18 at 9:20