A RetroSearch Logo

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Search Query:

Showing content from https://huggingface.co/datasets/dutch_social/blob/main/dutch_social.py below:

dutch_social.py · corona-tweet/dutch_social at main

Update files from the datasets library (from 1.6.1)

"""DUTCH SOCIAL: Annotated Covid19 tweets in Dutch language (sentiment, industry codes & province)."""


import json
import os

import datasets


_CITATION = """\
@data{FK2/MTPTL7_2020,
author = {Gupta, Aakash},
publisher = {COVID-19 Data Hub},
title = {{Dutch social media collection}},
year = {2020},
version = {DRAFT VERSION},
doi = {10.5072/FK2/MTPTL7},
url = {https://doi.org/10.5072/FK2/MTPTL7}
}
"""

_DESCRIPTION = """\
The dataset contains around 271,342 tweets. The tweets are filtered via the official Twitter API to
contain tweets in Dutch language or by users who have specified their location information within Netherlands
geographical boundaries. Using natural language processing we have classified the tweets for their HISCO codes.
If the user has provided their location within Dutch boundaries, we have also classified them to their
respective provinces The objective of this dataset is to make research data available publicly in a
FAIR (Findable, Accessible, Interoperable, Reusable) way.
Twitter's Terms of Service Licensed under Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) (2020-10-27)
"""

_HOMEPAGE = "http://datasets.coronawhy.org/dataset.xhtml?persistentId=doi:10.5072/FK2/MTPTL7"

_LICENSE = "CC BY-NC 4.0"

# Single zip archive per config; it contains train.jsonl / test.jsonl / dev.jsonl.
_URLs = {"dutch_social": "https://storage.googleapis.com/corona-tweet/dutch-tweets.zip"}

# Languages present in the corpus (Dutch originals plus English translations).
_LANG = ["nl", "en"]


def _clean_str(value):
    """Return *value* if it is a string, else the empty string (tolerates None/NaN in the raw JSON)."""
    return value if isinstance(value, str) else ""


def _clean_num(value, default):
    """Return *value* unless it is None, in which case return the sentinel *default*."""
    return default if value is None else value


class DutchSocial(datasets.GeneratorBasedBuilder):
    """
    Annotated Covid19 tweets in Dutch language. The tweets were filtered for users who had indicated
    their location within Netherlands or if the tweets were in Dutch language.
    The purpose of curating these tweets is to measure the economic impact of the Covid19 pandemic
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="dutch_social",
            version=VERSION,
            description="This part of my dataset provides config for the entire dataset",
        )
    ]

    def _info(self):
        """Return the DatasetInfo: feature schema, citation, homepage and license metadata."""
        features = datasets.Features(
            {
                "full_text": datasets.Value("string"),
                "text_translation": datasets.Value("string"),
                "screen_name": datasets.Value("string"),
                "description": datasets.Value("string"),
                "desc_translation": datasets.Value("string"),
                "location": datasets.Value("string"),
                "weekofyear": datasets.Value("int64"),
                "weekday": datasets.Value("int64"),
                "month": datasets.Value("int64"),
                "year": datasets.Value("int64"),
                "day": datasets.Value("int64"),
                "point_info": datasets.Value("string"),
                "point": datasets.Value("string"),
                "latitude": datasets.Value("float64"),
                "longitude": datasets.Value("float64"),
                "altitude": datasets.Value("float64"),
                "province": datasets.Value("string"),
                "hisco_standard": datasets.Value("string"),
                "hisco_code": datasets.Value("string"),
                "industry": datasets.Value("bool_"),
                "sentiment_pattern": datasets.Value("float64"),
                "subjective_pattern": datasets.Value("float64"),
                # Three-way sentiment label derived from the sentiment score.
                "label": datasets.ClassLabel(num_classes=3, names=["neg", "neu", "pos"], names_file=None, id=None),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then map each JSON-lines file to its split."""
        my_urls = _URLs[self.config.name]
        data_dir = dl_manager.download_and_extract(my_urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "train.jsonl"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": os.path.join(data_dir, "test.jsonl"), "split": "test"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "dev.jsonl"),
                    "split": "dev",
                },
            ),
        ]

    def _generate_examples(self, filepath, split, key=None):
        """Yields examples.

        Reads one JSON object per line from *filepath* and yields ``(id, example)`` pairs.
        Missing values are normalized to sentinels the schema can hold: non-string text
        fields become ``""``, ``None`` numerics become ``-1`` (``-100`` for
        ``sentiment_pattern``, whose natural range includes -1), and non-bool
        ``industry`` becomes ``False``.
        """
        with open(filepath, encoding="utf-8") as f:
            for id_, data in enumerate(f):
                data = json.loads(data)
                yield id_, {
                    "full_text": _clean_str(data["full_text"]),
                    "text_translation": _clean_str(data["text_translation"]),
                    "screen_name": _clean_str(data["screen_name"]),
                    "description": _clean_str(data["description"]),
                    "desc_translation": _clean_str(data["desc_translation"]),
                    "location": _clean_str(data["location"]),
                    "weekofyear": _clean_num(data["weekofyear"], -1),
                    "weekday": _clean_num(data["weekday"], -1),
                    "month": _clean_num(data["month"], -1),
                    "year": _clean_num(data["year"], -1),
                    "day": _clean_num(data["day"], -1),
                    # BUGFIX: the original condition was inverted
                    # (`"" if isinstance(..., str) else ...`), which blanked real
                    # strings and passed non-strings (e.g. None) straight through.
                    "point_info": _clean_str(data["point_info"]),
                    "point": _clean_str(data["point"]),
                    "latitude": _clean_num(data["latitude"], -1),
                    "longitude": _clean_num(data["longitude"], -1),
                    "altitude": _clean_num(data["altitude"], -1),
                    "province": _clean_str(data["province"]),
                    "hisco_standard": _clean_str(data["hisco_standard"]),
                    "hisco_code": _clean_str(data["hisco_code"]),
                    "industry": data["industry"] if isinstance(data["industry"], bool) else False,
                    "sentiment_pattern": _clean_num(data["sentiment_pattern"], -100),
                    "subjective_pattern": _clean_num(data["subjective_pattern"], -1),
                    "label": data["label"],
                }

RetroSearch is an open source project built by @garambo | Open a GitHub Issue

Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo

HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4